From 67090954cbbedc8c3790f66c1b32fc4a36ebe9ca Mon Sep 17 00:00:00 2001 From: Ritu Gala Date: Fri, 19 Apr 2024 13:03:30 -0400 Subject: [PATCH] Improving Transform and Rerank Module (#396) --- .gitignore | 3 +- examples/create_transform_data_example.py | 9 +- .../huggingface_datasets/dataset_index.json | 1 - .../reranking_dataset_index.json | 1 - .../huggingface_datasets_datafinder_index | Bin 7668967 -> 0 bytes .../description_dataset_retriever.py | 297 ++++++++++--- .../dataset_retriever/reranking_prompt.py | 205 ++++----- .../task_expansion_prompt.py | 26 ++ prompt2model/dataset_transformer/base.py | 1 - .../dataset_transformer/prompt_based.py | 257 +++++++---- .../dataset_transformer/prompt_template.py | 110 ++--- prompt2model/prompt_parser/instr_parser.py | 9 +- prompt2model/utils/api_tools.py | 33 +- prompt2model/utils/logging_utils.py | 16 +- prompt2model/utils/parse_responses.py | 147 +++--- prompt2model_demo.py | 22 +- .../dataset_index/retrieve_dataset_info.py | 2 +- .../reranking_dataset_index_tiny.json | 419 +++++++----------- tests/dataset_retriever_test.py | 115 ++--- tests/dataset_transformer_test.py | 37 +- 20 files changed, 920 insertions(+), 790 deletions(-) delete mode 100644 examples/huggingface_data/huggingface_datasets/dataset_index.json delete mode 100644 examples/huggingface_data/huggingface_datasets/reranking_dataset_index.json delete mode 100644 huggingface_data/huggingface_datasets/huggingface_datasets_datafinder_index create mode 100644 prompt2model/dataset_retriever/task_expansion_prompt.py diff --git a/.gitignore b/.gitignore index be4eca15f..74eb5b293 100644 --- a/.gitignore +++ b/.gitignore @@ -22,8 +22,9 @@ huggingface_data/huggingface_datasets/huggingface_datasets_datafinder_index huggingface_data/huggingface_datasets/reranking_dataset_index.json huggingface_data/huggingface_models/ retrieved_dataset_dict/ +result/ +checkpoint/ status.yaml - # Outputs generated by the colab demo trained_model/ trained_tokenizer/ diff --git a/examples/create_transform_data_example.py b/examples/create_transform_data_example.py index 4758948c1..ed33af765 100644 --- a/examples/create_transform_data_example.py +++ b/examples/create_transform_data_example.py @@ -33,12 +33,13 @@ # run this pipeline to retrieve relevant datasets, rerank them, # and transform them based on the prompt - retriever = DescriptionDatasetRetriever() - num_points_to_transform = 20 + total_num_points_to_transform = 20 + retriever = DescriptionDatasetRetriever( + auto_transform_data=True, + total_num_points_to_transform=total_num_points_to_transform, + ) retrieved_dataset_dict = retriever.retrieve_dataset_dict( prompt_spec, - auto_transform_data=True, - num_points_to_transform=num_points_to_transform, ) # save the final dataset to disk diff --git a/examples/huggingface_data/huggingface_datasets/dataset_index.json b/examples/huggingface_data/huggingface_datasets/dataset_index.json deleted file mode 100644 index 40d83175c..000000000 --- a/examples/huggingface_data/huggingface_datasets/dataset_index.json +++ /dev/null @@ -1 +0,0 @@ -{"acronym_identification": {"name": "acronym_identification", "description": "Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.", "evaluation_metadata": [{"config": "default", "task": "token-classification", "task_id": "entity_extraction", "splits": {"eval_split": "test"}, "col_mapping": {"tokens": "tokens", "labels": "tags"}}]}, "ade_corpus_v2": {"name": "ade_corpus_v2", "description": " ADE-Corpus-V2 Dataset: Adverse Drug Reaction Data.\n This is a dataset for Classification if a sentence is ADE-related (True) or not (False) and Relation Extraction between Adverse Drug Event and Drug.\n DRUG-AE.rel provides relations between drugs and adverse effects.\n DRUG-DOSE.rel provides relations between drugs and dosages.\n ADE-NEG.txt provides all sentences in the ADE corpus that DO NOT contain any drug-related adverse effects.", "evaluation_metadata": [{"config": "Ade_corpus_v2_classification", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "adversarial_qa": {"name": "adversarial_qa", "description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.", "evaluation_metadata": [{"config": "adversarialQA", "task": "question-answering", "task_id": "extractive_question_answering", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question": "question", "context": "context", "answers": {"text": "text", "answer_start": "answer_start"}}, "metrics": [{"type": "squad", "name": "SQuAD"}]}]}, "aeslc": {"name": "aeslc", "description": "A collection of email messages of employees in the Enron Corporation.\n\nThere are two features:\n - email_body: email body text.\n - subject_line: email subject text.", "evaluation_metadata": {}}, "afrikaans_ner_corpus": {"name": "afrikaans_ner_corpus", "description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.", "evaluation_metadata": {}}, "ag_news": {"name": "ag_news", "description": "AG is a collection of more than 1 million news articles. News articles have been\ngathered from more than 2000 news sources by ComeToMyHead in more than 1 year of\nactivity. ComeToMyHead is an academic news search engine which has been running\nsince July, 2004. The dataset is provided by the academic comunity for research\npurposes in data mining (clustering, classification, etc), information retrieval\n(ranking, search, etc), xml, data compression, data streaming, and any other\nnon-commercial activity. For more information, please refer to the link\nhttp://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .\n\nThe AG's news topic classification dataset is constructed by Xiang Zhang\n(xiang.zhang@nyu.edu) from the dataset above. It is used as a text\nclassification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann\nLeCun. Character-level Convolutional Networks for Text Classification. Advances\nin Neural Information Processing Systems 28 (NIPS 2015).", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "ai2_arc": {"name": "ai2_arc", "description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.", "evaluation_metadata": {}}, "air_dialogue": {"name": "air_dialogue", "description": "AirDialogue, is a large dataset that contains 402,038 goal-oriented conversations. To collect this dataset, we create a contextgenerator which provides travel and flight restrictions. Then the human annotators are asked to play the role of a customer or an agent and interact with the goal of successfully booking a trip given the restrictions.", "evaluation_metadata": {}}, "ajgt_twitter_ar": {"name": "ajgt_twitter_ar", "description": "Arabic Jordanian General Tweets (AJGT) Corpus consisted of 1,800 tweets annotated as positive and negative. Modern Standard Arabic (MSA) or Jordanian dialect.", "evaluation_metadata": {}}, "allegro_reviews": {"name": "allegro_reviews", "description": "Allegro Reviews is a sentiment analysis dataset, consisting of 11,588 product reviews written in Polish and extracted\nfrom Allegro.pl - a popular e-commerce marketplace. Each review contains at least 50 words and has a rating on a scale\nfrom one (negative review) to five (positive review).\n\nWe recommend using the provided train/dev/test split. The ratings for the test set reviews are kept hidden.\nYou can evaluate your model using the online evaluation tool available on klejbenchmark.com.", "evaluation_metadata": {}}, "allocine": {"name": "allocine", "description": " Allocine Dataset: A Large-Scale French Movie Reviews Dataset.\n This is a dataset for binary sentiment classification, made of user reviews scraped from Allocine.fr.\n It contains 100k positive and 100k negative reviews divided into 3 balanced splits: train (160k reviews), val (20k) and test (20k).", "evaluation_metadata": [{"config": "allocine", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"review": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "alt": {"name": "alt", "description": "The ALT project aims to advance the state-of-the-art Asian natural language processing (NLP) techniques through the open collaboration for developing and using ALT. It was first conducted by NICT and UCSY as described in Ye Kyaw Thu, Win Pa Pa, Masao Utiyama, Andrew Finch and Eiichiro Sumita (2016). Then, it was developed under ASEAN IVO as described in this Web page. The process of building ALT began with sampling about 20,000 sentences from English Wikinews, and then these sentences were translated into the other languages. ALT now has 13 languages: Bengali, English, Filipino, Hindi, Bahasa Indonesia, Japanese, Khmer, Lao, Malay, Myanmar (Burmese), Thai, Vietnamese, Chinese (Simplified Chinese).", "evaluation_metadata": {}}, "amazon_polarity": {"name": "amazon_polarity", "description": "The Amazon reviews dataset consists of reviews from amazon.\nThe data span a period of 18 years, including ~35 million reviews up to March 2013.\nReviews include product and user information, ratings, and a plaintext review.", "evaluation_metadata": [{"config": "amazon_polarity", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"content": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "amazon_reviews_multi": {"name": "amazon_reviews_multi", "description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\n\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\n\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.", "evaluation_metadata": {}}, "amazon_us_reviews": {"name": "amazon_us_reviews", "description": "Amazon Customer Reviews (a.k.a. Product Reviews) is one of Amazons iconic products. In a period of over two decades since the first review in 1995, millions of Amazon customers have contributed over a hundred million reviews to express opinions and describe their experiences regarding products on the Amazon.com website. This makes Amazon Customer Reviews a rich source of information for academic researchers in the fields of Natural Language Processing (NLP), Information Retrieval (IR), and Machine Learning (ML), amongst others. Accordingly, we are releasing this data to further research in multiple disciplines related to understanding customer product experiences. Specifically, this dataset was constructed to represent a sample of customer evaluations and opinions, variation in the perception of a product across geographical regions, and promotional intent or bias in reviews.\n\nOver 130+ million customer reviews are available to researchers as part of this release. The data is available in TSV files in the amazon-reviews-pds S3 bucket in AWS US East Region. Each line in the data files corresponds to an individual review (tab delimited, with no quote and escape characters).\n\nEach Dataset contains the following columns:\n\n- marketplace: 2 letter country code of the marketplace where the review was written.\n- customer_id: Random identifier that can be used to aggregate reviews written by a single author.\n- review_id: The unique ID of the review.\n- product_id: The unique Product ID the review pertains to. In the multilingual dataset the reviews for the same product in different countries can be grouped by the same product_id.\n- product_parent: Random identifier that can be used to aggregate reviews for the same product.\n- product_title: Title of the product.\n- product_category: Broad product category that can be used to group reviews (also used to group the dataset into coherent parts).\n- star_rating: The 1-5 star rating of the review.\n- helpful_votes: Number of helpful votes.\n- total_votes: Number of total votes the review received.\n- vine: Review was written as part of the Vine program.\n- verified_purchase: The review is on a verified purchase.\n- review_headline: The title of the review.\n- review_body: The review text.\n- review_date: The date the review was written.", "evaluation_metadata": {}}, "ambig_qa": {"name": "ambig_qa", "description": "AmbigNQ, a dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark. We find that over half of the questions in NQ-open are ambiguous. The types of ambiguity are diverse and sometimes subtle, many of which are only apparent after examining evidence provided by a very large text corpus. AMBIGNQ, a dataset with\n14,042 annotations on NQ-OPEN questions containing diverse types of ambiguity.\nWe provide two distributions of our new dataset AmbigNQ: a full version with all annotation metadata and a light version with only inputs and outputs.", "evaluation_metadata": {}}, "americas_nli": {"name": "americas_nli", "description": "AmericasNLI is an extension of XNLI (Conneau et al., 2018) \u2013 a natural language inference (NLI) dataset covering 15 high-resource languages \u2013 to 10 low-resource indigenous languages spoken in the Americas: Ashaninka, Aymara, Bribri, Guarani, Nahuatl, Otomi, Quechua, Raramuri, Shipibo-Konibo, and Wixarika. As with MNLI, the goal is to predict textual entailment (does sentence A imply/contradict/neither sentence B) and is a classification task (given two sentences, predict one of three labels).", "evaluation_metadata": {}}, "ami": {"name": "ami", "description": "The AMI Meeting Corpus consists of 100 hours of meeting recordings. The recordings use a range of signals\nsynchronized to a common timeline. These include close-talking and far-field microphones, individual and\nroom-view video cameras, and output from a slide projector and an electronic whiteboard. During the meetings,\nthe participants also have unsynchronized pens available to them that record what is written. The meetings\nwere recorded in English using three different rooms with different acoustic properties, and include mostly\nnon-native speakers. \\n", "evaluation_metadata": {}}, "amttl": {"name": "amttl", "description": "Chinese word segmentation (CWS) trained from open source corpus faces dramatic performance drop\nwhen dealing with domain text, especially for a domain with lots of special terms and diverse\nwriting styles, such as the biomedical domain. However, building domain-specific CWS requires\nextremely high annotation cost. In this paper, we propose an approach by exploiting domain-invariant\nknowledge from high resource to low resource domains. Extensive experiments show that our mode\nachieves consistently higher accuracy than the single-task CWS and other transfer learning\nbaselines, especially when there is a large disparity between source and target domains.\n\nThis dataset is the accompanied medical Chinese word segmentation (CWS) dataset.\nThe tags are in BIES scheme.\n\nFor more details see https://www.aclweb.org/anthology/C18-1307/", "evaluation_metadata": {}}, "anli": {"name": "anli", "description": "The Adversarial Natural Language Inference (ANLI) is a new large-scale NLI benchmark dataset,\nThe dataset is collected via an iterative, adversarial human-and-model-in-the-loop procedure.\nANLI is much more difficult than its predecessors including SNLI and MNLI.\nIt contains three rounds. Each round has train/dev/test splits.", "evaluation_metadata": {}}, "app_reviews": {"name": "app_reviews", "description": "It is a large dataset of Android applications belonging to 23 differentapps categories, which provides an overview of the types of feedback users report on the apps and documents the evolution of the related code metrics. The dataset contains about 395 applications of the F-Droid repository, including around 600 versions, 280,000 user reviews (extracted with specific text mining approaches)", "evaluation_metadata": {}}, "aqua_rat": {"name": "aqua_rat", "description": "A large-scale dataset consisting of approximately 100,000 algebraic word problems.\nThe solution to each question is explained step-by-step using natural language.\nThis data is used to train a program generation model that learns to generate the explanation,\nwhile generating the program that solves the question.", "evaluation_metadata": {}}, "aquamuse": {"name": "aquamuse", "description": "AQuaMuSe is a novel scalable approach to automatically mine dual query based multi-document summarization datasets for extractive and abstractive summaries using question answering dataset (Google Natural Questions) and large document corpora (Common Crawl)", "evaluation_metadata": {}}, "ar_res_reviews": {"name": "ar_res_reviews", "description": "Dataset of 8364 restaurant reviews scrapped from qaym.com in Arabic for sentiment analysis", "evaluation_metadata": {}}, "ar_sarcasm": {"name": "ar_sarcasm", "description": "ArSarcasm is a new Arabic sarcasm detection dataset.\nThe dataset was created using previously available Arabic sentiment analysis datasets (SemEval 2017 and ASTD)\n and adds sarcasm and dialect labels to them. The dataset contains 10,547 tweets, 1,682 (16%) of which are sarcastic.", "evaluation_metadata": {}}, "arabic_pos_dialect": {"name": "arabic_pos_dialect", "description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). Each dataset consists of a set of 350 manually segmented and POS tagged tweets.", "evaluation_metadata": {}}, "arabic_speech_corpus": {"name": "arabic_speech_corpus", "description": "This Speech corpus has been developed as part of PhD work carried out by Nawar Halabi at the University of Southampton.\nThe corpus was recorded in south Levantine Arabic\n(Damascian accent) using a professional studio. Synthesized speech as an output using this corpus has produced a high quality, natural voice.\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```", "evaluation_metadata": {}}, "arcd": {"name": "arcd", "description": " Arabic Reading Comprehension Dataset (ARCD) composed of 1,395 questions posed by crowdworkers on Wikipedia articles.", "evaluation_metadata": {}}, "art": {"name": "art", "description": "the Abductive Natural Language Inference Dataset from AI2", "evaluation_metadata": {}}, "ascent_kb": {"name": "ascent_kb", "description": "This dataset contains 8.9M commonsense assertions extracted by the Ascent pipeline (https://ascent.mpi-inf.mpg.de/).", "evaluation_metadata": {}}, "aslg_pc12": {"name": "aslg_pc12", "description": "A large synthetic collection of parallel English and ASL-Gloss texts.\nThere are two string features: text, and gloss.", "evaluation_metadata": {}}, "asnq": {"name": "asnq", "description": "ASNQ is a dataset for answer sentence selection derived from\nGoogle's Natural Questions (NQ) dataset (Kwiatkowski et al. 2019).\n\nEach example contains a question, candidate sentence, label indicating whether or not\nthe sentence answers the question, and two additional features --\nsentence_in_long_answer and short_answer_in_sentence indicating whether ot not the\ncandidate sentence is contained in the long_answer and if the short_answer is in the candidate sentence.\n\nFor more details please see\nhttps://arxiv.org/pdf/1911.04118.pdf\n\nand\n\nhttps://research.google/pubs/pub47761/", "evaluation_metadata": {}}, "assin": {"name": "assin", "description": "The ASSIN (Avalia\u00e7\u00e3o de Similaridade Sem\u00e2ntica e INfer\u00eancia textual) corpus is a corpus annotated with pairs of sentences written in\nPortuguese that is suitable for the exploration of textual entailment and paraphrasing classifiers. The corpus contains pairs of sentences\nextracted from news articles written in European Portuguese (EP) and Brazilian Portuguese (BP), obtained from Google News Portugal\nand Brazil, respectively. To create the corpus, the authors started by collecting a set of news articles describing the\nsame event (one news article from Google News Portugal and another from Google News Brazil) from Google News.\nThen, they employed Latent Dirichlet Allocation (LDA) models to retrieve pairs of similar sentences between sets of news\narticles that were grouped together around the same topic. For that, two LDA models were trained (for EP and for BP)\non external and large-scale collections of unannotated news articles from Portuguese and Brazilian news providers, respectively.\nThen, the authors defined a lower and upper threshold for the sentence similarity score of the retrieved pairs of sentences,\ntaking into account that high similarity scores correspond to sentences that contain almost the same content (paraphrase candidates),\nand low similarity scores correspond to sentences that are very different in content from each other (no-relation candidates).\nFrom the collection of pairs of sentences obtained at this stage, the authors performed some manual grammatical corrections\nand discarded some of the pairs wrongly retrieved. Furthermore, from a preliminary analysis made to the retrieved sentence pairs\nthe authors noticed that the number of contradictions retrieved during the previous stage was very low. Additionally, they also\nnoticed that event though paraphrases are not very frequent, they occur with some frequency in news articles. Consequently,\nin contrast with the majority of the currently available corpora for other languages, which consider as labels \u201cneutral\u201d, \u201centailment\u201d\nand \u201ccontradiction\u201d for the task of RTE, the authors of the ASSIN corpus decided to use as labels \u201cnone\u201d, \u201centailment\u201d and \u201cparaphrase\u201d.\nFinally, the manual annotation of pairs of sentences was performed by human annotators. At least four annotators were randomly\nselected to annotate each pair of sentences, which is done in two steps: (i) assigning a semantic similarity label (a score between 1 and 5,\nfrom unrelated to very similar); and (ii) providing an entailment label (one sentence entails the other, sentences are paraphrases,\nor no relation). Sentence pairs where at least three annotators do not agree on the entailment label were considered controversial\nand thus discarded from the gold standard annotations. The full dataset has 10,000 sentence pairs, half of which in Brazilian Portuguese\nand half in European Portuguese. Either language variant has 2,500 pairs for training, 500 for validation and 2,000 for testing.", "evaluation_metadata": {}}, "assin2": {"name": "assin2", "description": "The ASSIN 2 corpus is composed of rather simple sentences. Following the procedures of SemEval 2014 Task 1.\nThe training and validation data are composed, respectively, of 6,500 and 500 sentence pairs in Brazilian Portuguese,\nannotated for entailment and semantic similarity. Semantic similarity values range from 1 to 5, and text entailment\nclasses are either entailment or none. The test data are composed of approximately 3,000 sentence pairs with the same\nannotation. All data were manually annotated.", "evaluation_metadata": {}}, "facebook/babi_qa": {"name": "facebook/babi_qa", "description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.", "evaluation_metadata": {}}, "banking77": {"name": "banking77", "description": "BANKING77 dataset provides a very fine-grained set of intents in a banking domain.\nIt comprises 13,083 customer service queries labeled with 77 intents.\nIt focuses on fine-grained single-domain intent detection.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "bbaw_egyptian": {"name": "bbaw_egyptian", "description": "This dataset comprises parallel sentences of hieroglyphic encodings, transcription and translation\nas used in the paper Multi-Task Modeling of Phonographic Languages: Translating Middle Egyptian\nHieroglyph. The data triples are extracted from the digital corpus of Egyptian texts compiled by\nthe project \"Strukturen und Transformationen des Wortschatzes der \u00e4gyptischen Sprache\".", "evaluation_metadata": {}}, "bbc_hindi_nli": {"name": "bbc_hindi_nli", "description": "This dataset is used to train models for Natural Language Inference Tasks in Low-Resource Languages like Hindi.", "evaluation_metadata": {}}, "bc2gm_corpus": {"name": "bc2gm_corpus", "description": "Nineteen teams presented results for the Gene Mention Task at the BioCreative II Workshop.\nIn this task participants designed systems to identify substrings in sentences corresponding to gene name mentions.\nA variety of different methods were used and the results varied with a highest achieved F1 score of 0.8721.\nHere we present brief descriptions of all the methods used and a statistical analysis of the results.\nWe also demonstrate that, by combining the results from all submissions, an F score of 0.9066 is feasible,\nand furthermore that the best result makes use of the lowest scoring submissions.\n\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/\n\nThe original dataset can be downloaded from: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-ii-corpus/\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll", "evaluation_metadata": {}}, "beans": {"name": "beans", "description": "Beans is a dataset of images of beans taken in the field using smartphone\ncameras. It consists of 3 classes: 2 disease classes and the healthy class.\nDiseases depicted include Angular Leaf Spot and Bean Rust. Data was annotated\nby experts from the National Crops Resources Research Institute (NaCRRI) in\nUganda and collected by the Makerere AI research lab.", "evaluation_metadata": {}}, "best2009": {"name": "best2009", "description": "`best2009` is a Thai word-tokenization dataset from encyclopedia, novels, news and articles by\n[NECTEC](https://www.nectec.or.th/) (148,995/2,252 lines of train/test). It was created for\n[BEST 2010: Word Tokenization Competition](https://thailang.nectec.or.th/archive/indexa290.html?q=node/10).\nThe test set answers are not provided publicly.", "evaluation_metadata": {}}, "bianet": {"name": "bianet", "description": "A parallel news corpus in Turkish, Kurdish and English.\nBianet collects 3,214 Turkish articles with their sentence-aligned Kurdish or English translations from the Bianet online newspaper.\n3 languages, 3 bitexts\ntotal number of files: 6\ntotal number of tokens: 2.25M\ntotal number of sentence fragments: 0.14M", "evaluation_metadata": {}}, "bible_para": {"name": "bible_para", "description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M", "evaluation_metadata": {}}, "big_patent": {"name": "big_patent", "description": "BIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.", "evaluation_metadata": {}}, "billsum": {"name": "billsum", "description": "BillSum, summarization of US Congressional and California state bills.\n\nThere are several features:\n - text: bill text.\n - summary: summary of the bills.\n - title: title of the bills.\nfeatures for us bills. ca bills does not have.\n - text_len: number of chars in text.\n - sum_len: number of chars in summary.", "evaluation_metadata": [{"config": "default", "task": "summarization", "task_id": "summarization", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "summary": "target"}, "metrics": [{"type": "rouge", "name": "Rouge"}]}]}, "bing_coronavirus_query_set": {"name": "bing_coronavirus_query_set", "description": "This dataset was curated from the Bing search logs (desktop users only) over the period of Jan 1st, 2020 \u2013 (Current Month - 1). Only searches that were issued many times by multiple users were included. The dataset includes queries from all over the world that had an intent related to the Coronavirus or Covid-19. In some cases this intent is explicit in the query itself (e.g., \u201cCoronavirus updates Seattle\u201d), in other cases it is implicit , e.g. \u201cShelter in place\u201d. The implicit intent of search queries (e.g., \u201cToilet paper\u201d) was extracted using random walks on the click graph as outlined in this paper by Microsoft Research. All personal data were removed.", "evaluation_metadata": {}}, "biomrc": {"name": "biomrc", "description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the previous BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the new dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating that the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is also higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new BERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or surpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different sizes, also releasing our code, and providing a leaderboard.", "evaluation_metadata": {}}, "biosses": {"name": "biosses", "description": "BIOSSES is a benchmark dataset for biomedical sentence similarity estimation. The dataset comprises 100 sentence pairs, in which each sentence was selected from the TAC (Text Analysis Conference) Biomedical Summarization Track Training Dataset containing articles from the biomedical domain. The sentence pairs were evaluated by five different human experts that judged their similarity and gave scores ranging from 0 (no relation) to 4 (equivalent).", "evaluation_metadata": {}}, "blbooks": {"name": "blbooks", "description": "A dataset comprising of text created by OCR from the 49,455 digitised books, equating to 65,227 volumes (25+ million pages), published between c. 1510 - c. 1900.\nThe books cover a wide range of subject areas including philosophy, history, poetry and literature.", "evaluation_metadata": {}}, "blbooksgenre": {"name": "blbooksgenre", "description": "This dataset contains metadata for resources belonging to the British Library\u2019s digitised printed books (18th-19th century) collection (bl.uk/collection-guides/digitised-printed-books).\nThis metadata has been extracted from British Library catalogue records.\nThe metadata held within our main catalogue is updated regularly.\nThis metadata dataset should be considered a snapshot of this metadata.", "evaluation_metadata": {}}, "blended_skill_talk": {"name": "blended_skill_talk", "description": "A dataset of 7k conversations explicitly designed to exhibit multiple conversation modes: displaying personality, having empathy, and demonstrating knowledge.", "evaluation_metadata": {}}, "blimp": {"name": "blimp", "description": "BLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.", "evaluation_metadata": {}}, "blog_authorship_corpus": {"name": "blog_authorship_corpus", "description": "The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person.\n\nEach blog is presented as a separate file, the name of which indicates a blogger id# and the blogger\u2019s self-provided gender, age, industry and astrological sign. (All are labeled for gender and age but for many, industry and/or sign is marked as unknown.)\n\nAll bloggers included in the corpus fall into one of three age groups:\n- 8240 \"10s\" blogs (ages 13-17),\n- 8086 \"20s\" blogs (ages 23-27),\n- 2994 \"30s\" blogs (ages 33-47).\n\nFor each age group there are an equal number of male and female bloggers.\n\nEach blog in the corpus includes at least 200 occurrences of common English words. All formatting has been stripped with two exceptions. Individual posts within a single blogger are separated by the date of the following post and links within a post are denoted by the label urllink.\n\nThe corpus may be freely used for non-commercial research purposes.", "evaluation_metadata": {}}, "bn_hate_speech": {"name": "bn_hate_speech", "description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and\nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. This is a subset of the full dataset.\n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.", "evaluation_metadata": {}}, "bookcorpus": {"name": "bookcorpus", "description": "Books are a rich source of both fine-grained information, how a character, an object or a scene looks like, as well as high-level semantics, what someone is thinking, feeling and how these states evolve through a story.This work aims to align books to their movie releases in order to providerich descriptive explanations for visual content that go semantically farbeyond the captions available in current datasets. \\", "evaluation_metadata": {}}, "bookcorpusopen": {"name": "bookcorpusopen", "description": "Books are a rich source of both fine-grained information, how a character, an object or a scene looks like, as well as high-level semantics, what someone is thinking, feeling and how these states evolve through a story.\nThis version of bookcorpus has 17868 dataset items (books). Each item contains two fields: title and text. The title is the name of the book (just the file name) while text contains unprocessed book text. The bookcorpus has been prepared by Shawn Presser and is generously hosted by The-Eye. The-Eye is a non-profit, community driven platform dedicated to the archiving and long-term preservation of any and all data including but by no means limited to... websites, books, games, software, video, audio, other digital-obscura and ideas.", "evaluation_metadata": {}}, "boolq": {"name": "boolq", "description": "BoolQ is a question answering dataset for yes/no questions containing 15942 examples. These questions are naturally\noccurring ---they are generated in unprompted and unconstrained settings.\nEach example is a triplet of (question, passage, answer), with the title of the page as optional additional context.\nThe text-pair classification setup is similar to existing natural language inference tasks.", "evaluation_metadata": {}}, "break_data": {"name": "break_data", "description": "Break is a human annotated dataset of natural language questions and their Question Decomposition Meaning Representations\n(QDMRs). Break consists of 83,978 examples sampled from 10 question answering datasets over text, images and databases.\nThis repository contains the Break dataset along with information on the exact data format.", "evaluation_metadata": {}}, "bsd_ja_en": {"name": "bsd_ja_en", "description": "This is the Business Scene Dialogue (BSD) dataset,\na Japanese-English parallel corpus containing written conversations\nin various business scenarios.\n\nThe dataset was constructed in 3 steps:\n 1) selecting business scenes,\n 2) writing monolingual conversation scenarios according to the selected scenes, and\n 3) translating the scenarios into the other language.\n\nHalf of the monolingual scenarios were written in Japanese\nand the other half were written in English.\n\nFields:\n- id: dialogue identifier\n- no: sentence pair number within a dialogue\n- en_speaker: speaker name in English\n- ja_speaker: speaker name in Japanese\n- en_sentence: sentence in English\n- ja_sentence: sentence in Japanese\n- original_language: language in which monolingual scenario was written\n- tag: scenario\n- title: scenario title", "evaluation_metadata": {}}, "bswac": {"name": "bswac", "description": "The Bosnian web corpus bsWaC was built by crawling the .ba top-level domain in 2014. The corpus was near-deduplicated on paragraph level, normalised via diacritic restoration, morphosyntactically annotated and lemmatised. The corpus is shuffled by paragraphs. Each paragraph contains metadata on the URL, domain and language identification (Bosnian vs. Croatian vs. Serbian).\n\nVersion 1.0 of this corpus is described in http://www.aclweb.org/anthology/W14-0405. Version 1.1 contains newer and better linguistic annotations.", "evaluation_metadata": {}}, "c3": {"name": "c3", "description": "Machine reading comprehension tasks require a machine reader to answer questions relevant to the given document. In this paper, we present the first free-form multiple-Choice Chinese machine reading Comprehension dataset (C^3), containing 13,369 documents (dialogues or more formally written mixed-genre texts) and their associated 19,577 multiple-choice free-form questions collected from Chinese-as-a-second-language examinations.\nWe present a comprehensive analysis of the prior knowledge (i.e., linguistic, domain-specific, and general world knowledge) needed for these real-world problems. We implement rule-based and popular neural methods and find that there is still a significant performance gap between the best performing model (68.5%) and human readers (96.0%), especially on problems that require prior knowledge. We further study the effects of distractor plausibility and data augmentation based on translated relevant datasets for English on model performance. We expect C^3 to present great challenges to existing systems as answering 86.8% of questions requires both knowledge within and beyond the accompanying document, and we hope that C^3 can serve as a platform to study how to leverage various kinds of prior knowledge to better understand a given written or orally oriented text.", "evaluation_metadata": {}}, "c4": {"name": "c4", "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.", "evaluation_metadata": {}}, "cail2018": {"name": "cail2018", "description": "In this paper, we introduce Chinese AI and Law challenge dataset (CAIL2018),\nthe first large-scale Chinese legal dataset for judgment prediction. CAIL contains more than 2.6 million\ncriminal cases published by the Supreme People's Court of China, which are several times larger than other\ndatasets in existing works on judgment prediction. Moreover, the annotations of judgment results are more\ndetailed and rich. It consists of applicable law articles, charges, and prison terms, which are expected\nto be inferred according to the fact descriptions of cases. For comparison, we implement several conventional\ntext classification baselines for judgment prediction and experimental results show that it is still a\nchallenge for current models to predict the judgment results of legal cases, especially on prison terms.\nTo help the researchers make improvements on legal judgment prediction.", "evaluation_metadata": {}}, "caner": {"name": "caner", "description": "Classical Arabic Named Entity Recognition corpus as a new corpus of tagged data that can be useful for handling the issues in recognition of Arabic named entities.", "evaluation_metadata": {}}, "capes": {"name": "capes", "description": "A parallel corpus of theses and dissertations abstracts in English and Portuguese were collected from the CAPES website (Coordena\u00e7\u00e3o de Aperfei\u00e7oamento de Pessoal de N\u00edvel Superior) - Brazil. The corpus is sentence aligned for all language pairs. Approximately 240,000 documents were collected and aligned using the Hunalign algorithm.", "evaluation_metadata": {}}, "casino": {"name": "casino", "description": "We provide a novel dataset (referred to as CaSiNo) of 1030 negotiation dialogues. Two participants take the role of campsite neighbors and negotiate for Food, Water, and Firewood packages, based on their individual preferences and requirements. This design keeps the task tractable, while still facilitating linguistically rich and personal conversations. This helps to overcome the limitations of prior negotiation datasets such as Deal or No Deal and Craigslist Bargain. Each dialogue consists of rich meta-data including participant demographics, personality, and their subjective evaluation of the negotiation in terms of satisfaction and opponent likeness.", "evaluation_metadata": {}}, "catalonia_independence": {"name": "catalonia_independence", "description": "This dataset contains two corpora in Spanish and Catalan that consist of annotated Twitter messages for automatic stance detection. The data was collected over 12 days during February and March of 2019 from tweets posted in Barcelona, and during September of 2018 from tweets posted in the town of Terrassa, Catalonia.\n\nEach corpus is annotated with three classes: AGAINST, FAVOR and NEUTRAL, which express the stance towards the target - independence of Catalonia.", "evaluation_metadata": {}}, "cawac": {"name": "cawac", "description": "caWaC is a 780-million-token web corpus of Catalan built from the .cat top-level-domain in late 2013.", "evaluation_metadata": {}}, "cbt": {"name": "cbt", "description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.", "evaluation_metadata": {}}, "cc100": {"name": "cc100", "description": "This corpus is an attempt to recreate the dataset used for training XLM-R. This corpus comprises of monolingual data for 100+ languages and also includes data for romanized languages (indicated by *_rom). This was constructed using the urls and paragraph indices provided by the CC-Net repository by processing January-December 2018 Commoncrawl snapshots. Each file comprises of documents separated by double-newlines and paragraphs within the same document separated by a newline. The data is generated using the open source CC-Net repository. No claims of intellectual property are made on the work of preparation of the corpus.", "evaluation_metadata": {}}, "cc_news": {"name": "cc_news", "description": "CC-News containing news articles from news sites all over the world The data is available on AWS S3 in the Common Crawl bucket at /crawl-data/CC-NEWS/. This version of the dataset has 708241 articles. It represents a small portion of English language subset of the CC-News dataset created using news-please(Hamborg et al.,2017) to collect and extract English language portion of CC-News.", "evaluation_metadata": {}}, "ccaligned_multilingual": {"name": "ccaligned_multilingual", "description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).", "evaluation_metadata": {}}, "cdsc": {"name": "cdsc", "description": "Polish CDSCorpus consists of 10K Polish sentence pairs which are human-annotated for semantic relatedness and entailment. The dataset may be used for the evaluation of compositional distributional semantics models of Polish. The dataset was presented at ACL 2017. Please refer to the Wr\u00f3blewska and Krasnowska-Kiera\u015b (2017) for a detailed description of the resource.", "evaluation_metadata": {}}, "cdt": {"name": "cdt", "description": "The Cyberbullying Detection task was part of 2019 edition of PolEval competition. The goal is to predict if a given Twitter message contains a cyberbullying (harmful) content.", "evaluation_metadata": {}}, "cedr": {"name": "cedr", "description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.", "evaluation_metadata": {}}, "cfq": {"name": "cfq", "description": "The CFQ dataset (and it's splits) for measuring compositional generalization.\n\nSee https://arxiv.org/abs/1912.09713.pdf for background.\n\nExample usage:\ndata = datasets.load_dataset('cfq/mcd1')", "evaluation_metadata": {}}, "chr_en": {"name": "chr_en", "description": "ChrEn is a Cherokee-English parallel dataset to facilitate machine translation research between Cherokee and English.\nChrEn is extremely low-resource contains 14k sentence pairs in total, split in ways that facilitate both in-domain and out-of-domain evaluation.\nChrEn also contains 5k Cherokee monolingual data to enable semi-supervised learning.", "evaluation_metadata": {}}, "cifar10": {"name": "cifar10", "description": "The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images\nper class. There are 50000 training images and 10000 test images.", "evaluation_metadata": {}}, "cifar100": {"name": "cifar100", "description": "The CIFAR-100 dataset consists of 60000 32x32 colour images in 100 classes, with 600 images\nper class. There are 500 training images and 100 testing images per class. There are 50000 training images and 10000 test images. The 100 classes are grouped into 20 superclasses.\nThere are two labels per image - fine label (actual class) and coarse label (superclass).", "evaluation_metadata": {}}, "circa": {"name": "circa", "description": "The Circa (meaning \u2018approximately\u2019) dataset aims to help machine learning systems\nto solve the problem of interpreting indirect answers to polar questions.\n\nThe dataset contains pairs of yes/no questions and indirect answers, together with\nannotations for the interpretation of the answer. The data is collected in 10\ndifferent social conversational situations (eg. food preferences of a friend).\n\nNOTE: There might be missing labels in the dataset and we have replaced them with -1.\nThe original dataset contains no train/dev/test splits.", "evaluation_metadata": {}}, "civil_comments": {"name": "civil_comments", "description": "The comments in this dataset come from an archive of the Civil Comments\nplatform, a commenting plugin for independent news sites. These public comments\nwere created from 2015 - 2017 and appeared on approximately 50 English-language\nnews sites across the world. When Civil Comments shut down in 2017, they chose\nto make the public comments available in a lasting open archive to enable future\nresearch. The original data, published on figshare, includes the public comment\ntext, some associated metadata such as article IDs, timestamps and\ncommenter-generated \"civility\" labels, but does not include user ids. Jigsaw\nextended this dataset by adding additional labels for toxicity and identity\nmentions. This data set is an exact replica of the data released for the\nJigsaw Unintended Bias in Toxicity Classification Kaggle challenge. This\ndataset is released under CC0, as is the underlying comment text.", "evaluation_metadata": {}}, "clickbait_news_bg": {"name": "clickbait_news_bg", "description": "Dataset with clickbait and fake news in Bulgarian. Introduced for the Hack the Fake News 2017.", "evaluation_metadata": {}}, "climate_fever": {"name": "climate_fever", "description": "A dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change collected on the internet. Each claim is accompanied by five manually annotated evidence sentences retrieved from the English Wikipedia that support, refute or do not give enough information to validate the claim totalling in 7,675 claim-evidence pairs. The dataset features challenging claims that relate multiple facets and disputed cases of claims where both supporting and refuting evidence are present.", "evaluation_metadata": {}}, "clinc_oos": {"name": "clinc_oos", "description": " This dataset is for evaluating the performance of intent classification systems in the\n presence of \"out-of-scope\" queries. By \"out-of-scope\", we mean queries that do not fall\n into any of the system-supported intent classes. Most datasets include only data that is\n \"in-scope\". Our dataset includes both in-scope and out-of-scope data. You might also know\n the term \"out-of-scope\" by other terms, including \"out-of-domain\" or \"out-of-distribution\".", "evaluation_metadata": {}}, "clue": {"name": "clue", "description": "CLUE, A Chinese Language Understanding Evaluation Benchmark\n(https://www.cluebenchmarks.com/) is a collection of resources for training,\nevaluating, and analyzing Chinese language understanding systems.", "evaluation_metadata": {}}, "cmu_hinglish_dog": {"name": "cmu_hinglish_dog", "description": "This is a collection of text conversations in Hinglish (code mixing between Hindi-English) and their corresponding English only versions. Can be used for Translating between the two.", "evaluation_metadata": {}}, "cnn_dailymail": {"name": "cnn_dailymail", "description": "CNN/DailyMail non-anonymized summarization dataset.\n\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary", "evaluation_metadata": [{"config": "3.0.0", "task": "summarization", "task_id": "summarization", "splits": {"eval_split": "test"}, "col_mapping": {"article": "text", "highlights": "target"}}]}, "coarse_discourse": {"name": "coarse_discourse", "description": "dataset contains discourse annotation and relation on threads from reddit during 2016", "evaluation_metadata": {}}, "codah": {"name": "codah", "description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.", "evaluation_metadata": {}}, "code_search_net": {"name": "code_search_net", "description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.", "evaluation_metadata": {}}, "code_x_glue_cc_clone_detection_big_clone_bench": {"name": "code_x_glue_cc_clone_detection_big_clone_bench", "description": "Given two codes as the input, the task is to do binary classification (0/1), where 1 stands for semantic equivalence and 0 for others. Models are evaluated by F1 score.\nThe dataset we use is BigCloneBench and filtered following the paper Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree.", "evaluation_metadata": {}}, "code_x_glue_cc_clone_detection_poj104": {"name": "code_x_glue_cc_clone_detection_poj104", "description": "Given a code and a collection of candidates as the input, the task is to return Top K codes with the same semantic. Models are evaluated by MAP score.\nWe use POJ-104 dataset on this task.", "evaluation_metadata": {}}, "code_x_glue_cc_cloze_testing_all": {"name": "code_x_glue_cc_cloze_testing_all", "description": "Cloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "evaluation_metadata": {}}, "code_x_glue_cc_cloze_testing_maxmin": {"name": "code_x_glue_cc_cloze_testing_maxmin", "description": "Cloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "evaluation_metadata": {}}, "code_x_glue_cc_code_completion_line": {"name": "code_x_glue_cc_code_completion_line", "description": "Complete the unfinished line given previous context. Models are evaluated by exact match and edit similarity.\nWe propose line completion task to test model's ability to autocomplete a line. Majority code completion systems behave well in token level completion, but fail in completing an unfinished line like a method call with specific parameters, a function signature, a loop condition, a variable definition and so on. When a software develop finish one or more tokens of the current line, the line level completion model is expected to generate the entire line of syntactically correct code.\nLine level code completion task shares the train/dev dataset with token level completion. After training a model on CodeCompletion-token, you could directly use it to test on line-level completion.", "evaluation_metadata": {}}, "code_x_glue_cc_code_completion_token": {"name": "code_x_glue_cc_code_completion_token", "description": "Predict next code token given context of previous tokens. Models are evaluated by token level accuracy.\nCode completion is a one of the most widely used features in software development through IDEs. An effective code completion tool could improve software developers' productivity. We provide code completion evaluation tasks in two granularities -- token level and line level. Here we introduce token level code completion. Token level task is analogous to language modeling. Models should have be able to predict the next token in arbitary types.", "evaluation_metadata": {}}, "code_x_glue_cc_code_refinement": {"name": "code_x_glue_cc_code_refinement", "description": "We use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). The source side is a Java function with bugs and the target side is the refined one. All the function and variable names are normalized. Their dataset contains two subsets ( i.e.small and medium) based on the function length.", "evaluation_metadata": {}}, "code_x_glue_cc_code_to_code_trans": {"name": "code_x_glue_cc_code_to_code_trans", "description": "The dataset is collected from several public repos, including Lucene(http://lucene.apache.org/), POI(http://poi.apache.org/), JGit(https://github.com/eclipse/jgit/) and Antlr(https://github.com/antlr/).\n We collect both the Java and C# versions of the codes and find the parallel functions. After removing duplicates and functions with the empty body, we split the whole dataset into training, validation and test sets.", "evaluation_metadata": {}}, "code_x_glue_cc_defect_detection": {"name": "code_x_glue_cc_defect_detection", "description": "Given a source code, the task is to identify whether it is an insecure code that may attack software systems, such as resource leaks, use-after-free vulnerabilities and DoS attack. We treat the task as binary classification (0/1), where 1 stands for insecure code and 0 for secure code.\nThe dataset we use comes from the paper Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks. We combine all projects and split 80%/10%/10% for training/dev/test.", "evaluation_metadata": {}}, "code_x_glue_tc_nl_code_search_adv": {"name": "code_x_glue_tc_nl_code_search_adv", "description": "The dataset we use comes from CodeSearchNet and we filter the dataset as the following:\n- Remove examples that codes cannot be parsed into an abstract syntax tree.\n- Remove examples that #tokens of documents is < 3 or >256\n- Remove examples that documents contain special tokens (e.g. or https:...)\n- Remove examples that documents are not English.", "evaluation_metadata": {}}, "code_x_glue_tc_text_to_code": {"name": "code_x_glue_tc_text_to_code", "description": "We use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to Code in Programmatic Context. See paper for details.", "evaluation_metadata": {}}, "code_x_glue_tt_text_to_text": {"name": "code_x_glue_tt_text_to_text", "description": "The dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "evaluation_metadata": {}}, "com_qa": {"name": "com_qa", "description": "ComQA is a dataset of 11,214 questions, which were collected from WikiAnswers, a community question answering website.\nBy collecting questions from such a site we ensure that the information needs are ones of interest to actual users.\nMoreover, questions posed there are often cannot be answered by commercial search engines or QA technology, making them\nmore interesting for driving future research compared to those collected from an engine's query log. The dataset contains\nquestions with various challenging phenomena such as the need for temporal reasoning, comparison (e.g., comparatives,\nsuperlatives, ordinals), compositionality (multiple, possibly nested, subquestions with multiple entities), and\nunanswerable questions (e.g., Who was the first human being on Mars?). Through a large crowdsourcing effort, questions\nin ComQA are grouped into 4,834 paraphrase clusters that express the same information need. Each cluster is annotated\nwith its answer(s). ComQA answers come in the form of Wikipedia entities wherever possible. Wherever the answers are\ntemporal or measurable quantities, TIMEX3 and the International System of Units (SI) are used for normalization.", "evaluation_metadata": {}}, "common_gen": {"name": "common_gen", "description": "CommonGen is a constrained text generation task, associated with a benchmark dataset,\nto explicitly test machines for the ability of generative commonsense reasoning. Given\na set of common concepts; the task is to generate a coherent sentence describing an\neveryday scenario using these concepts.\n\nCommonGen is challenging because it inherently requires 1) relational reasoning using\nbackground commonsense knowledge, and 2) compositional generalization ability to work\non unseen concept combinations. Our dataset, constructed through a combination of\ncrowd-sourcing from AMT and existing caption corpora, consists of 30k concept-sets and\n50k sentences in total.", "evaluation_metadata": {}}, "common_language": {"name": "common_language", "description": "This dataset is composed of speech recordings from languages that were carefully selected from the CommonVoice database.\nThe total duration of audio recordings is 45.1 hours (i.e., 1 hour of material for each language).\nThe dataset has been extracted from CommonVoice to train language-id systems.", "evaluation_metadata": {}}, "common_voice": {"name": "common_voice", "description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.", "evaluation_metadata": {}}, "commonsense_qa": {"name": "commonsense_qa", "description": "CommonsenseQA is a new multiple-choice question answering dataset that requires different types of commonsense knowledge\nto predict the correct answers . It contains 12,102 questions with one correct answer and four distractor answers.\nThe dataset is provided in two major training/validation/testing set splits: \"Random split\" which is the main evaluation\nsplit, and \"Question token split\", see paper for details.", "evaluation_metadata": {}}, "competition_math": {"name": "competition_math", "description": "The Mathematics Aptitude Test of Heuristics (MATH) dataset consists of problems\nfrom mathematics competitions, including the AMC 10, AMC 12, AIME, and more.\nEach problem in MATH has a full step-by-step solution, which can be used to teach\nmodels to generate answer derivations and explanations.", "evaluation_metadata": {}}, "compguesswhat": {"name": "compguesswhat", "description": "CompGuessWhat?! is an instance of a multi-task framework for evaluating the quality of learned neural representations,\n in particular concerning attribute grounding. Use this dataset if you want to use the set of games whose reference\n scene is an image in VisualGenome. Visit the website for more details: https://compguesswhat.github.io", "evaluation_metadata": {}}, "conceptnet5": {"name": "conceptnet5", "description": "This dataset is designed to provide training data\r\nfor common sense relationships pulls together from various sources.\r\n\r\nThe dataset is multi-lingual. See langauge codes and language info\r\nhere: https://github.com/commonsense/conceptnet5/wiki/Languages\r\n\r\n\r\nThis dataset provides an interface for the conceptnet5 csv file, and\r\nsome (but not all) of the raw text data used to build conceptnet5:\r\nomcsnet_sentences_free.txt, and omcsnet_sentences_more.txt.\r\n\r\nOne use of this dataset would be to learn to extract the conceptnet\r\nrelationship from the omcsnet sentences.\r\n\r\nConceptnet5 has 34,074,917 relationships. Of those relationships,\r\nthere are 2,176,099 surface text sentences related to those 2M\r\nentries.\r\n\r\nomcsnet_sentences_free has 898,161 lines. omcsnet_sentences_more has\r\n2,001,736 lines.\r\n\r\nOriginal downloads are available here\r\nhttps://github.com/commonsense/conceptnet5/wiki/Downloads. For more\r\ninformation, see: https://github.com/commonsense/conceptnet5/wiki\r\n\r\nThe omcsnet data comes with the following warning from the authors of\r\nthe above site: Remember: this data comes from various forms of\r\ncrowdsourcing. Sentences in these files are not necessarily true,\r\nuseful, or appropriate.", "evaluation_metadata": {}}, "conll2000": {"name": "conll2000", "description": " Text chunking consists of dividing a text in syntactically correlated parts of words. For example, the sentence\n He reckons the current account deficit will narrow to only # 1.8 billion in September . can be divided as follows:\n[NP He ] [VP reckons ] [NP the current account deficit ] [VP will narrow ] [PP to ] [NP only # 1.8 billion ]\n[PP in ] [NP September ] .\n\nText chunking is an intermediate step towards full parsing. It was the shared task for CoNLL-2000. Training and test\ndata for this task is available. This data consists of the same partitions of the Wall Street Journal corpus (WSJ)\nas the widely used data for noun phrase chunking: sections 15-18 as training data (211727 tokens) and section 20 as\ntest data (47377 tokens). The annotation of the data has been derived from the WSJ corpus by a program written by\nSabine Buchholz from Tilburg University, The Netherlands.", "evaluation_metadata": {}}, "conll2003": {"name": "conll2003", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": [{"config": "conll2003", "task": "token-classification", "task_id": "entity_extraction", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"tokens": "tokens", "ner_tags": "tags"}, "metrics": [{"type": "seqeval", "name": "seqeval"}]}]}, "conllpp": {"name": "conllpp", "description": "CoNLLpp is a corrected version of the CoNLL2003 NER dataset where labels of 5.38% of the sentences in the test set\nhave been manually corrected. The training set and development set are included for completeness.\nFor more details see https://www.aclweb.org/anthology/D19-1519/ and https://github.com/ZihanWangKi/CrossWeigh", "evaluation_metadata": [{"config": "conllpp", "task": "token-classification", "task_id": "entity_extraction", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"tokens": "tokens", "ner_tags": "tags"}, "metrics": [{"type": "seqeval", "name": "seqeval"}]}]}, "conv_ai": {"name": "conv_ai", "description": "ConvAI is a dataset of human-to-bot conversations labelled for quality. This data can be used to train a metric for evaluating dialogue systems. Moreover, it can be used in the development of chatbots themselves: it contains the information on the quality of utterances and entire dialogues, that can guide a dialogue system in search of better answers.", "evaluation_metadata": {}}, "conv_ai_2": {"name": "conv_ai_2", "description": "ConvAI is a dataset of human-to-bot conversations labelled for quality. This data can be used to train a metric for evaluating dialogue systems. Moreover, it can be used in the development of chatbots themselves: it contains the information on the quality of utterances and entire dialogues, that can guide a dialogue system in search of better answers.", "evaluation_metadata": {}}, "conv_ai_3": {"name": "conv_ai_3", "description": "The Conv AI 3 challenge is organized as part of the Search-oriented Conversational AI (SCAI) EMNLP workshop in 2020. The main aim of the conversational systems is to return an appropriate answer in response to the user requests. However, some user requests might be ambiguous. In Information Retrieval (IR) settings such a situation is handled mainly through the diversification of search result page. It is however much more challenging in dialogue settings. Hence, we aim to study the following situation for dialogue settings:\n- a user is asking an ambiguous question (where ambiguous question is a question to which one can return > 1 possible answers)\n- the system must identify that the question is ambiguous, and, instead of trying to answer it directly, ask a good clarifying question.", "evaluation_metadata": {}}, "conv_questions": {"name": "conv_questions", "description": "ConvQuestions is the first realistic benchmark for conversational question answering over knowledge graphs.\nIt contains 11,200 conversations which can be evaluated over Wikidata. The questions feature a variety of complex\nquestion phenomena like comparisons, aggregations, compositionality, and temporal reasoning.", "evaluation_metadata": {}}, "coqa": {"name": "coqa", "description": "CoQA: A Conversational Question Answering Challenge", "evaluation_metadata": {}}, "cornell_movie_dialog": {"name": "cornell_movie_dialog", "description": "This corpus contains a large metadata-rich collection of fictional conversations extracted from raw movie scripts:\n- 220,579 conversational exchanges between 10,292 pairs of movie characters\n- involves 9,035 characters from 617 movies\n- in total 304,713 utterances\n- movie metadata included:\n - genres\n - release year\n - IMDB rating\n - number of IMDB votes\n - IMDB rating\n- character metadata included:\n - gender (for 3,774 characters)\n - position on movie credits (3,321 characters)", "evaluation_metadata": {}}, "cos_e": {"name": "cos_e", "description": "Common Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.", "evaluation_metadata": {}}, "cosmos_qa": {"name": "cosmos_qa", "description": "Cosmos QA is a large-scale dataset of 35.6K problems that require commonsense-based reading comprehension, formulated as multiple-choice questions. It focuses on reading between the lines over a diverse collection of people's everyday narratives, asking questions concerning on the likely causes or effects of events that require reasoning beyond the exact text spans in the context", "evaluation_metadata": {}}, "counter": {"name": "counter", "description": " The COrpus of Urdu News TExt Reuse (COUNTER) corpus contains 1200 documents with real examples of text reuse from the field of journalism. It has been manually annotated at document level with three levels of reuse: wholly derived, partially derived and non derived.", "evaluation_metadata": {}}, "covid_qa_castorini": {"name": "covid_qa_castorini", "description": "CovidQA is the beginnings of a question answering dataset specifically designed for COVID-19, built by hand from knowledge gathered from Kaggle's COVID-19 Open Research Dataset Challenge.", "evaluation_metadata": {}}, "covid_qa_deepset": {"name": "covid_qa_deepset", "description": "COVID-QA is a Question Answering dataset consisting of 2,019 question/answer pairs annotated by volunteer biomedical experts on scientific articles related to COVID-19.", "evaluation_metadata": {}}, "covid_tweets_japanese": {"name": "covid_tweets_japanese", "description": "53,640 Japanese tweets with annotation if a tweet is related to COVID-19 or not. The annotation is by majority decision by 5 - 10 crowd workers. Target tweets include \"COVID\" or \"\u30b3\u30ed\u30ca\". The period of the tweets is from around January 2020 to around June 2020. The original tweets are not contained. Please use Twitter API to get them, for example.", "evaluation_metadata": {}}, "cppe-5": {"name": "cppe-5", "description": "CPPE - 5 (Medical Personal Protective Equipment) is a new challenging dataset with the goal\nto allow the study of subordinate categorization of medical personal protective equipments,\nwhich is not possible with other popular data sets that focus on broad level categories.", "evaluation_metadata": {}}, "crawl_domain": {"name": "crawl_domain", "description": "Corpus of domain names scraped from Common Crawl and manually annotated to add word boundaries (e.g. \"commoncrawl\" to \"common crawl\"). Breaking domain names such as \"openresearch\" into component words \"open\" and \"research\" is important for applications such as Text-to-Speech synthesis and web search. Common Crawl is an open repository of web crawl data that can be accessed and analyzed by anyone. Specifically, we scraped the plaintext (WET) extracts for domain names from URLs that contained diverse letter casing (e.g. \"OpenBSD\"). Although in the previous example, segmentation is trivial using letter casing, this was not always the case (e.g. \"NASA\"), so we had to manually annotate the data. The dataset is stored as plaintext file where each line is an example of space separated segments of a domain name. The examples are stored in their original letter casing, but harder and more interesting examples can be generated by lowercasing the input first.", "evaluation_metadata": {}}, "crd3": {"name": "crd3", "description": "Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.\nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding\nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player\ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,\nand semantic ties to the previous dialogues.", "evaluation_metadata": {}}, "crows_pairs": {"name": "crows_pairs", "description": "CrowS-Pairs, a challenge dataset for measuring the degree to which U.S. stereotypical biases present in the masked language models (MLMs).", "evaluation_metadata": {}}, "cs_restaurants": {"name": "cs_restaurants", "description": "This is a dataset for NLG in task-oriented spoken dialogue systems with Czech as the target language. It originated as\na translation of the English San Francisco Restaurants dataset by Wen et al. (2015).", "evaluation_metadata": {}}, "cuad": {"name": "cuad", "description": "Contract Understanding Atticus Dataset (CUAD) v1 is a corpus of more than 13,000 labels in 510\ncommercial legal contracts that have been manually labeled to identify 41 categories of important\nclauses that lawyers look for when reviewing contracts in connection with corporate transactions.", "evaluation_metadata": [{"config": "default", "task": "question-answering", "task_id": "extractive_question_answering", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"question": "question", "context": "context", "answers": {"text": "text", "answer_start": "answer_start"}}, "metrics": [{"type": "cuad", "name": "CUAD"}]}]}, "curiosity_dialogs": {"name": "curiosity_dialogs", "description": "This dataset contains 14K dialogs (181K utterances) where users and assistants converse about geographic topics like\ngeopolitical entities and locations. This dataset is annotated with pre-existing user knowledge, message-level dialog\nacts, grounding to Wikipedia, and user reactions to messages.", "evaluation_metadata": {}}, "daily_dialog": {"name": "daily_dialog", "description": "We develop a high-quality multi-turn dialog dataset, DailyDialog, which is intriguing in several aspects.\nThe language is human-written and less noisy. The dialogues in the dataset reflect our daily communication way\nand cover various topics about our daily life. We also manually label the developed dataset with communication\nintention and emotion information. Then, we evaluate existing approaches on DailyDialog dataset and hope it\nbenefit the research field of dialog systems.", "evaluation_metadata": {}}, "dane": {"name": "dane", "description": "The DaNE dataset has been annotated with Named Entities for PER, ORG and LOC\nby the Alexandra Institute.\nIt is a reannotation of the UD-DDT (Universal Dependency - Danish Dependency Treebank)\nwhich has annotations for dependency parsing and part-of-speech (POS) tagging.\nThe Danish UD treebank (Johannsen et al., 2015, UD-DDT) is a conversion of\nthe Danish Dependency Treebank (Buch-Kromann et al. 2003) based on texts\nfrom Parole (Britt, 1998).", "evaluation_metadata": {}}, "danish_political_comments": {"name": "danish_political_comments", "description": "The dataset consists of 9008 sentences that are labelled with fine-grained polarity in the range from -2 to 2 (negative to postive). The quality of the fine-grained is not cross validated and is therefore subject to uncertainties; however, the simple polarity has been cross validated and therefore is considered to be more correct.", "evaluation_metadata": {}}, "dart": {"name": "dart", "description": "DART is a large and open-domain structured DAta Record to Text generation corpus with high-quality\nsentence annotations with each input being a set of entity-relation triples following a tree-structured ontology.\nIt consists of 82191 examples across different domains with each input being a semantic RDF triple set derived\nfrom data records in tables and the tree ontology of table schema, annotated with sentence description that\ncovers all facts in the triple set.\n\nDART is released in the following paper where you can find more details and baseline results:\nhttps://arxiv.org/abs/2007.02871", "evaluation_metadata": {}}, "datacommons_factcheck": {"name": "datacommons_factcheck", "description": "A dataset of fact checked claims by news media maintained by datacommons.org", "evaluation_metadata": {}}, "dbpedia_14": {"name": "dbpedia_14", "description": "The DBpedia ontology classification dataset is constructed by picking 14 non-overlapping classes\nfrom DBpedia 2014. They are listed in classes.txt. From each of thse 14 ontology classes, we\nrandomly choose 40,000 training samples and 5,000 testing samples. Therefore, the total size\nof the training dataset is 560,000 and testing dataset 70,000.\nThere are 3 columns in the dataset (same for train and test splits), corresponding to class index\n(1 to 14), title and content. The title and content are escaped using double quotes (\"), and any\ninternal double quote is escaped by 2 double quotes (\"\"). There are no new lines in title or content.", "evaluation_metadata": {}}, "dbrd": {"name": "dbrd", "description": "The Dutch Book Review Dataset (DBRD) contains over 110k book reviews of which 22k have associated binary sentiment polarity labels. It is intended as a benchmark for sentiment classification in Dutch and created due to a lack of annotated datasets in Dutch that are suitable for this task.", "evaluation_metadata": {}}, "deal_or_no_dialog": {"name": "deal_or_no_dialog", "description": "A large dataset of human-human negotiations on a multi-issue bargaining task, where agents who cannot observe each other\u2019s reward functions must reach anagreement (o a deal) via natural language dialogue.", "evaluation_metadata": {}}, "definite_pronoun_resolution": {"name": "definite_pronoun_resolution", "description": "Composed by 30 students from one of the author's undergraduate classes. These\nsentence pairs cover topics ranging from real events (e.g., Iran's plan to\nattack the Saudi ambassador to the U.S.) to events/characters in movies (e.g.,\nBatman) and purely imaginary situations, largely reflecting the pop culture as\nperceived by the American kids born in the early 90s. Each annotated example\nspans four lines: the first line contains the sentence, the second line contains\nthe target pronoun, the third line contains the two candidate antecedents, and\nthe fourth line contains the correct antecedent. If the target pronoun appears\nmore than once in the sentence, its first occurrence is the one to be resolved.", "evaluation_metadata": {}}, "dengue_filipino": {"name": "dengue_filipino", "description": " Benchmark dataset for low-resource multiclass classification, with 4,015 training, 500 testing, and 500 validation examples, each labeled as part of five classes. Each sample can be a part of multiple classes. Collected as tweets.", "evaluation_metadata": {}}, "dialog_re": {"name": "dialog_re", "description": "DialogRE is the first human-annotated dialogue based relation extraction (RE) dataset aiming\nto support the prediction of relation(s) between two arguments that appear in a dialogue.\nThe dataset annotates all occurrences of 36 possible relation types that exist between pairs\nof arguments in the 1,788 dialogues originating from the complete transcripts of Friends.", "evaluation_metadata": {}}, "disaster_response_messages": {"name": "disaster_response_messages", "description": "This dataset contains 30,000 messages drawn from events including an earthquake in Haiti in 2010, an earthquake in Chile in 2010, floods in Pakistan in 2010, super-storm Sandy in the U.S.A. in 2012, and news articles spanning a large number of years and 100s of different disasters.\nThe data has been encoded with 36 different categories related to disaster response and has been stripped of messages with sensitive information in their entirety.\nUpon release, this is the featured dataset of a new Udacity course on Data Science and the AI4ALL summer school and is especially utile for text analytics and natural language processing (NLP) tasks and models.\nThe input data in this job contains thousands of untranslated disaster-related messages and their English translations.", "evaluation_metadata": {}}, "discofuse": {"name": "discofuse", "description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.", "evaluation_metadata": {}}, "disfl_qa": {"name": "disfl_qa", "description": "Disfl-QA is a targeted dataset for contextual disfluencies in an information seeking setting,\nnamely question answering over Wikipedia passages. Disfl-QA builds upon the SQuAD-v2 (Rajpurkar et al., 2018)\ndataset, where each question in the dev set is annotated to add a contextual disfluency using the paragraph as\na source of distractors.\n\nThe final dataset consists of ~12k (disfluent question, answer) pairs. Over 90% of the disfluencies are\ncorrections or restarts, making it a much harder test set for disfluency correction. Disfl-QA aims to fill a\nmajor gap between speech and NLP research community. We hope the dataset can serve as a benchmark dataset for\ntesting robustness of models against disfluent inputs.\n\nOur expriments reveal that the state-of-the-art models are brittle when subjected to disfluent inputs from\nDisfl-QA. Detailed experiments and analyses can be found in our paper.", "evaluation_metadata": {}}, "docred": {"name": "docred", "description": "Multiple entities in a document generally exhibit complex inter-sentence relations, and cannot be well handled by existing relation extraction (RE) methods that typically focus on extracting intra-sentence relations for single entity pairs. In order to accelerate the research on document-level RE, we introduce DocRED, a new dataset constructed from Wikipedia and Wikidata with three features:\n - DocRED annotates both named entities and relations, and is the largest human-annotated dataset for document-level RE from plain text.\n - DocRED requires reading multiple sentences in a document to extract entities and infer their relations by synthesizing all information of the document.\n - Along with the human-annotated data, we also offer large-scale distantly supervised data, which enables DocRED to be adopted for both supervised and weakly supervised scenarios.", "evaluation_metadata": {}}, "dream": {"name": "dream", "description": "DREAM is a multiple-choice Dialogue-based REAding comprehension exaMination dataset. In contrast to existing reading comprehension datasets, DREAM is the first to focus on in-depth multi-turn multi-party dialogue understanding.", "evaluation_metadata": {}}, "drop": {"name": "drop", "description": "DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs.\n. DROP is a crowdsourced, adversarially-created, 96k-question benchmark, in which a system must resolve references in a\nquestion, perhaps to multiple input positions, and perform discrete operations over them (such as addition, counting, or\n sorting). These operations require a much more comprehensive understanding of the content of paragraphs than what was\n necessary for prior datasets.", "evaluation_metadata": {}}, "duorc": {"name": "duorc", "description": "DuoRC contains 186,089 unique question-answer pairs created from a collection of 7680 pairs of movie plots where each pair in the collection reflects two versions of the same movie.", "evaluation_metadata": {}}, "dutch_social": {"name": "dutch_social", "description": "The dataset contains around 271,342 tweets. The tweets are filtered via the official Twitter API to\ncontain tweets in Dutch language or by users who have specified their location information within Netherlands\ngeographical boundaries. Using natural language processing we have classified the tweets for their HISCO codes.\nIf the user has provided their location within Dutch boundaries, we have also classified them to their respective\nprovinces The objective of this dataset is to make research data available publicly in a FAIR (Findable, Accessible,\nInteroperable, Reusable) way. Twitter's Terms of Service Licensed under Attribution-NonCommercial 4.0 International\n(CC BY-NC 4.0) (2020-10-27)", "evaluation_metadata": {}}, "dyk": {"name": "dyk", "description": "The Did You Know (pol. Czy wiesz?) dataset consists of human-annotated question-answer pairs. The task is to predict if the answer is correct. We chose the negatives which have the largest token overlap with a question.", "evaluation_metadata": {}}, "e2e_nlg": {"name": "e2e_nlg", "description": "The E2E dataset is used for training end-to-end, data-driven natural language generation systems in the restaurant domain, which is ten times bigger than existing, frequently used datasets in this area.\nThe E2E dataset poses new challenges:\n(1) its human reference texts show more lexical richness and syntactic variation, including discourse phenomena;\n(2) generating from this set requires content selection. As such, learning from this dataset promises more natural, varied and less template-like system utterances.\n\nE2E is released in the following paper where you can find more details and baseline results:\nhttps://arxiv.org/abs/1706.09254", "evaluation_metadata": {}}, "e2e_nlg_cleaned": {"name": "e2e_nlg_cleaned", "description": "An update release of E2E NLG Challenge data with cleaned MRs and scripts, accompanying the following paper:\n\nOnd\u0159ej Du\u0161ek, David M. Howcroft, and Verena Rieser (2019): Semantic Noise Matters for Neural Natural Language Generation. In INLG, Tokyo, Japan.", "evaluation_metadata": {}}, "ecb": {"name": "ecb", "description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M", "evaluation_metadata": {}}, "ecthr_cases": {"name": "ecthr_cases", "description": "The ECtHR Cases dataset is designed for experimentation of neural judgment prediction and rationale extraction considering ECtHR cases.", "evaluation_metadata": {}}, "eitb_parcc": {"name": "eitb_parcc", "description": "EiTB-ParCC: Parallel Corpus of Comparable News. A Basque-Spanish parallel corpus provided by Vicomtech (https://www.vicomtech.org), extracted from comparable news produced by the Basque public broadcasting group Euskal Irrati Telebista.", "evaluation_metadata": {}}, "electricity_load_diagrams": {"name": "electricity_load_diagrams", "description": "This new dataset contains hourly kW electricity consumption time series of 370 Portuguese clients from 2011 to 2014.", "evaluation_metadata": {}}, "eli5_category": {"name": "eli5_category", "description": "The ELI5-Category dataset is a smaller but newer and categorized version of the original ELI5 dataset. After 2017, a tagging system was introduced to this subreddit so that the questions can be categorized into different topics according to their tags. Since the training and validation set is built by questions in different topics, the dataset is expected to alleviate the train/validation overlapping issue in the original ELI5 dataset.", "evaluation_metadata": {}}, "emea": {"name": "emea", "description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M", "evaluation_metadata": {}}, "dair-ai/emotion": {"name": "dair-ai/emotion", "description": "Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "emotone_ar": {"name": "emotone_ar", "description": "Dataset of 10065 tweets in Arabic for Emotion detection in Arabic text", "evaluation_metadata": {}}, "empathetic_dialogues": {"name": "empathetic_dialogues", "description": "PyTorch original implementation of Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset", "evaluation_metadata": {}}, "enriched_web_nlg": {"name": "enriched_web_nlg", "description": "WebNLG is a valuable resource and benchmark for the Natural Language Generation (NLG) community. However, as other NLG benchmarks, it only consists of a collection of parallel raw representations and their corresponding textual realizations. This work aimed to provide intermediate representations of the data for the development and evaluation of popular tasks in the NLG pipeline architecture (Reiter and Dale, 2000), such as Discourse Ordering, Lexicalization, Aggregation and Referring Expression Generation.", "evaluation_metadata": {}}, "eraser_multi_rc": {"name": "eraser_multi_rc", "description": "Eraser Multi RC is a dataset for queries over multi-line passages, along with\nanswers and a rationalte. Each example in this dataset has the following 5 parts\n1. A Mutli-line Passage\n2. A Query about the passage\n3. An Answer to the query\n4. A Classification as to whether the answer is right or wrong\n5. An Explanation justifying the classification", "evaluation_metadata": {}}, "esnli": {"name": "esnli", "description": "The e-SNLI dataset extends the Stanford Natural Language Inference Dataset to\ninclude human-annotated natural language explanations of the entailment\nrelations.", "evaluation_metadata": {}}, "eth_py150_open": {"name": "eth_py150_open", "description": "A redistributable subset of the ETH Py150 corpus, introduced in the ICML 2020 paper 'Learning and Evaluating Contextual Embedding of Source Code'", "evaluation_metadata": {}}, "ethos": {"name": "ethos", "description": "ETHOS: onlinE haTe speecH detectiOn dataSet. This repository contains a dataset for hate speech\ndetection on social media platforms, called Ethos. There are two variations of the dataset:\n\nEthos_Dataset_Binary: contains 998 comments in the dataset alongside with a label\nabout hate speech presence or absence. 565 of them do not contain hate speech,\nwhile the rest of them, 433, contain.\n\nEthos_Dataset_Multi_Label: which contains 8 labels for the 433 comments with hate speech content.\nThese labels are violence (if it incites (1) or not (0) violence), directed_vs_general (if it is\ndirected to a person (1) or a group (0)), and 6 labels about the category of hate speech like,\ngender, race, national_origin, disability, religion and sexual_orientation.", "evaluation_metadata": {}}, "eu_regulatory_ir": {"name": "eu_regulatory_ir", "description": "EURegIR: Regulatory Compliance IR (EU/UK)", "evaluation_metadata": {}}, "eurlex": {"name": "eurlex", "description": "EURLEX57K contains 57k legislative documents in English from EUR-Lex portal, annotated with EUROVOC concepts.", "evaluation_metadata": {}}, "europa_eac_tm": {"name": "europa_eac_tm", "description": "In October 2012, the European Union's (EU) Directorate General for Education and Culture ( DG EAC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-six languages. This resource bears the name EAC Translation Memory, short EAC-TM.\n\nEAC-TM covers up to 26 languages: 22 official languages of the EU (all except Irish) plus Icelandic, Croatian, Norwegian and Turkish. EAC-TM thus contains translations from English into the following 25 languages: Bulgarian, Czech, Danish, Dutch, Estonian, German, Greek, Finnish, French, Croatian, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish and Turkish.\n\nAll documents and sentences were originally written in English (source language is English) and then translated into the other languages. The texts were translated by staff of the National Agencies of the Lifelong Learning and Youth in Action programmes. They are typically professionals in the field of education/youth and EU programmes. They are thus not professional translators, but they are normally native speakers of the target language.", "evaluation_metadata": {}}, "europa_ecdc_tm": {"name": "europa_ecdc_tm", "description": "In October 2012, the European Union (EU) agency 'European Centre for Disease Prevention and Control' (ECDC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-five languages. This resource bears the name EAC Translation Memory, short EAC-TM.\nECDC-TM covers 25 languages: the 23 official languages of the EU plus Norwegian (Norsk) and Icelandic. ECDC-TM was created by translating from English into the following 24 languages: Bulgarian, Czech, Danish, Dutch, English, Estonian, Gaelige (Irish), German, Greek, Finnish, French, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian (NOrsk), Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish and Swedish.\nAll documents and sentences were thus originally written in English. They were then translated into the other languages by professional translators from the Translation Centre CdT in Luxembourg.", "evaluation_metadata": {}}, "europarl_bilingual": {"name": "europarl_bilingual", "description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.", "evaluation_metadata": {}}, "event2Mind": {"name": "event2Mind", "description": "In Event2Mind, we explore the task of understanding stereotypical intents and reactions to events. Through crowdsourcing, we create a large corpus with 25,000 events and free-form descriptions of their intents and reactions, both of the event's subject and (potentially implied) other participants.", "evaluation_metadata": {}}, "evidence_infer_treatment": {"name": "evidence_infer_treatment", "description": "Data and code from our \"Inferring Which Medical Treatments Work from Reports of Clinical Trials\", NAACL 2019. This work concerns inferring the results reported in clinical trials from text.\n\nThe dataset consists of biomedical articles describing randomized control trials (RCTs) that compare multiple treatments. Each of these articles will have multiple questions, or 'prompts' associated with them. These prompts will ask about the relationship between an intervention and comparator with respect to an outcome, as reported in the trial. For example, a prompt may ask about the reported effects of aspirin as compared to placebo on the duration of headaches. For the sake of this task, we assume that a particular article will report that the intervention of interest either significantly increased, significantly decreased or had significant effect on the outcome, relative to the comparator.\n\nThe dataset could be used for automatic data extraction of the results of a given RCT. This would enable readers to discover the effectiveness of different treatments without needing to read the paper.", "evaluation_metadata": {}}, "exams": {"name": "exams", "description": "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations.\nIt consists of more than 24,000 high-quality high school exam questions in 16 languages,\ncovering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.", "evaluation_metadata": {}}, "factckbr": {"name": "factckbr", "description": "A dataset to study Fake News in Portuguese, presenting a supposedly false News along with their respective fact check and classification.\nThe data is collected from the ClaimReview, a structured data schema used by fact check agencies to share their results in search engines, enabling data collect in real time.\nThe FACTCK.BR dataset contains 1309 claims with its corresponding label.", "evaluation_metadata": {}}, "fake_news_english": {"name": "fake_news_english", "description": "Fake news has become a major societal issue and a technical challenge for social media companies to identify. This content is difficult to identify because the term \"fake news\" covers intentionally false, deceptive stories as well as factual errors, satire, and sometimes, stories that a person just does not like. Addressing the problem requires clear definitions and examples. In this work, we present a dataset of fake news and satire stories that are hand coded, verified, and, in the case of fake news, include rebutting stories. We also include a thematic content analysis of the articles, identifying major themes that include hyperbolic support or condemnation of a gure, conspiracy theories, racist themes, and discrediting of reliable sources. In addition to releasing this dataset for research use, we analyze it and show results based on language that are promising for classification purposes. Overall, our contribution of a dataset and initial analysis are designed to support future work by fake news researchers.", "evaluation_metadata": {}}, "fake_news_filipino": {"name": "fake_news_filipino", "description": " Low-Resource Fake News Detection Corpora in Filipino. The first of its kind. Contains 3,206 expertly-labeled news samples, half of which are real and half of which are fake.", "evaluation_metadata": {}}, "farsi_news": {"name": "farsi_news", "description": "Contains Farsi (Persian) datasets for Machine Learning tasks, particularly NLP.\nThese datasets have been extracted from the RSS feed of two Farsi news agency websites:\n\n- Hamshahri\n- RadioFarda", "evaluation_metadata": {}}, "fashion_mnist": {"name": "fashion_mnist", "description": "Fashion-MNIST is a dataset of Zalando's article images\u2014consisting of a training set of\n60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image,\nassociated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in\nreplacement for the original MNIST dataset for benchmarking machine learning algorithms.\nIt shares the same image size and structure of training and testing splits.", "evaluation_metadata": {}}, "few_rel": {"name": "few_rel", "description": "FewRel is a large-scale few-shot relation extraction dataset, which contains more than one hundred relations and tens of thousands of annotated instances cross different domains.", "evaluation_metadata": {}}, "financial_phrasebank": {"name": "financial_phrasebank", "description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.", "evaluation_metadata": {}}, "finer": {"name": "finer", "description": "The directory data contains a corpus of Finnish technology related news articles with a manually prepared\nnamed entity annotation (digitoday.2014.csv). The text material was extracted from the archives of Digitoday,\na Finnish online technology news source (www.digitoday.fi). The corpus consists of 953 articles\n(193,742 word tokens) with six named entity classes (organization, location, person, product, event, and date).\nThe corpus is available for research purposes and can be readily used for development of NER systems for Finnish.", "evaluation_metadata": {}}, "flue": {"name": "flue", "description": "FLUE is an evaluation setup for French NLP systems similar to the popular GLUE benchmark. The goal is to enable further reproducible experiments in the future and to share models and progress on the French language.", "evaluation_metadata": {}}, "freebase_qa": {"name": "freebase_qa", "description": "FreebaseQA is for open-domain factoid question answering (QA) tasks over structured knowledge bases, like Freebase The data set is generated by matching trivia-type question-answer pairs with subject-predicateobject triples in Freebase.", "evaluation_metadata": {}}, "gap": {"name": "gap", "description": "GAP is a gender-balanced dataset containing 8,908 coreference-labeled pairs of\n(ambiguous pronoun, antecedent name), sampled from Wikipedia and released by\nGoogle AI Language for the evaluation of coreference resolution in practical\napplications.", "evaluation_metadata": {}}, "gem": {"name": "gem", "description": "GEM is a benchmark environment for Natural Language Generation with a focus on its Evaluation,\nboth through human annotations and automated Metrics.\n\nGEM aims to:\n- measure NLG progress across 13 datasets spanning many NLG tasks and languages.\n- provide an in-depth analysis of data and models presented via data statements and challenge sets.\n- develop standards for evaluation of generated text using both automated and human metrics.\n\nIt is our goal to regularly update GEM and to encourage toward more inclusive practices in dataset development\nby extending existing data or developing datasets for additional languages.", "evaluation_metadata": {}}, "generated_reviews_enth": {"name": "generated_reviews_enth", "description": " `generated_reviews_enth`\n Generated product reviews dataset for machine translation quality prediction, part of [scb-mt-en-th-2020](https://arxiv.org/pdf/2007.03541.pdf)\n `generated_reviews_enth` is created as part of [scb-mt-en-th-2020](https://arxiv.org/pdf/2007.03541.pdf) for machine translation task.\n This dataset (referred to as `generated_reviews_yn` in [scb-mt-en-th-2020](https://arxiv.org/pdf/2007.03541.pdf)) are English product reviews\n generated by [CTRL](https://arxiv.org/abs/1909.05858), translated by Google Translate API and annotated as accepted or rejected (`correct`)\n based on fluency and adequacy of the translation by human annotators.\n This allows it to be used for English-to-Thai translation quality esitmation (binary label), machine translation, and sentiment analysis.", "evaluation_metadata": {}}, "generics_kb": {"name": "generics_kb", "description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. We recommend you start with GenericsKB-Best.", "evaluation_metadata": {}}, "germaner": {"name": "germaner", "description": "GermaNER is a freely available statistical German Named Entity Tagger based on conditional random fields(CRF). The tagger is trained and evaluated on the NoSta-D Named Entity dataset, which was used in the GermEval 2014 for named entity recognition. The tagger comes close to the performance of the best (proprietary) system in the competition with 77% F-measure (this is the latest result; the one reported in the paper is 76%) test set performance on the four standard NER classes (PERson, LOCation, ORGanisation and OTHer).\n\nWe describe a range of features and their influence on German NER classification and provide a comparative evaluation and some analysis of the results. The software components, the training data and all data used for feature generation are distributed under permissive licenses, thus this tagger can be used in academic and commercial settings without restrictions or fees. The tagger is available as a command-line tool and as an Apache UIMA component.", "evaluation_metadata": {}}, "giga_fren": {"name": "giga_fren", "description": "Giga-word corpus for French-English from WMT2010 collected by Chris Callison-Burch\n2 languages, total number of files: 452\ntotal number of tokens: 1.43G\ntotal number of sentence fragments: 47.55M", "evaluation_metadata": {}}, "glucose": {"name": "glucose", "description": "When humans read or listen, they make implicit commonsense inferences that frame their understanding of what happened and why. As a step toward AI systems that can build similar mental models, we introduce GLUCOSE, a large-scale dataset of implicit commonsense causal knowledge, encoded as causal mini-theories about the world, each grounded in a narrative context.", "evaluation_metadata": {}}, "glue": {"name": "glue", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": [{"config": "cola", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence": "text", "label": "target"}}, {"config": "sst2", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence": "text", "label": "target"}}, {"config": "mrpc", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "qqp", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question1": "text1", "question2": "text2", "label": "target"}}, {"config": "stsb", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "mnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation_matched"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "mnli_mismatched", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "mnli_matched", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "qnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question": "text1", "sentence": "text2", "label": "target"}}, {"config": "rte", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "wnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}]}, "gnad10": {"name": "gnad10", "description": "This dataset is intended to advance topic classification for German texts. A classifier that is efffective in\nEnglish may not be effective in German dataset because it has a higher inflection and longer compound words.\nThe 10kGNAD dataset contains 10273 German news articles from an Austrian online newspaper categorized into\n9 categories. Article titles and text are concatenated together and authors are removed to avoid a keyword-like\nclassification on authors that write frequently about one category. This dataset can be used as a benchmark\nfor German topic classification.", "evaluation_metadata": {}}, "go_emotions": {"name": "go_emotions", "description": "The GoEmotions dataset contains 58k carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\nThe emotion categories are admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire,\ndisappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness,\noptimism, pride, realization, relief, remorse, sadness, surprise.", "evaluation_metadata": {}}, "gooaq": {"name": "gooaq", "description": "GooAQ is a large-scale dataset with a variety of answer types. This dataset contains over\n5 million questions and 3 million answers collected from Google. GooAQ questions are collected\nsemi-automatically from the Google search engine using its autocomplete feature. This results in\nnaturalistic questions of practical interest that are nonetheless short and expressed using simple\nlanguage. GooAQ answers are mined from Google's responses to our collected questions, specifically from\nthe answer boxes in the search results. This yields a rich space of answer types, containing both\ntextual answers (short and long) as well as more structured ones such as collections.", "evaluation_metadata": {}}, "google_wellformed_query": {"name": "google_wellformed_query", "description": "Google's query wellformedness dataset was created by crowdsourcing well-formedness annotations for 25,100 queries from the Paralex corpus. Every query was annotated by five raters each with 1/0 rating of whether or not the query is well-formed.", "evaluation_metadata": {}}, "grail_qa": {"name": "grail_qa", "description": "Strongly Generalizable Question Answering (GrailQA) is a new large-scale, high-quality dataset for question answering on knowledge bases (KBQA) on Freebase with 64,331 questions annotated with both answers and corresponding logical forms in different syntax (i.e., SPARQL, S-expression, etc.). It can be used to test three levels of generalization in KBQA: i.i.d., compositional, and zero-shot.", "evaluation_metadata": {}}, "great_code": {"name": "great_code", "description": "The dataset for the variable-misuse task, described in the ICLR 2020 paper 'Global Relational Models of Source Code' [https://openreview.net/forum?id=B1lnbRNtwr]\n\nThis is the public version of the dataset used in that paper. The original, used to produce the graphs in the paper, could not be open-sourced due to licensing issues. See the public associated code repository [https://github.com/VHellendoorn/ICLR20-Great] for results produced from this dataset.\n\nThis dataset was generated synthetically from the corpus of Python code in the ETH Py150 Open dataset [https://github.com/google-research-datasets/eth_py150_open].", "evaluation_metadata": {}}, "greek_legal_code": {"name": "greek_legal_code", "description": "Greek_Legal_Code contains 47k classified legal resources from Greek Legislation. Its origin is \u201cPermanent Greek Legislation Code - Raptarchis\u201d,\na collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.", "evaluation_metadata": {}}, "guardian_authorship": {"name": "guardian_authorship", "description": "A dataset cross-topic authorship attribution. The dataset is provided by Stamatatos 2013.\n1- The cross-topic scenarios are based on Table-4 in Stamatatos 2017 (Ex. cross_topic_1 => row 1:P S U&W ).\n2- The cross-genre scenarios are based on Table-5 in the same paper. (Ex. cross_genre_1 => row 1:B P S&U&W).\n\n3- The same-topic/genre scenario is created by grouping all the datasts as follows.\nFor ex., to use same_topic and split the data 60-40 use:\ntrain_ds = load_dataset('guardian_authorship', name=\"cross_topic_<<#>>\",\n split='train[:60%]+validation[:60%]+test[:60%]')\ntests_ds = load_dataset('guardian_authorship', name=\"cross_topic_<<#>>\",\n split='train[-40%:]+validation[-40%:]+test[-40%:]')\n\nIMPORTANT: train+validation+test[:60%] will generate the wrong splits because the data is imbalanced\n\n* See https://huggingface.co/docs/datasets/splits.html for detailed/more examples", "evaluation_metadata": {}}, "gutenberg_time": {"name": "gutenberg_time", "description": "A clean data resource containing all explicit time references in a dataset of 52,183 novels whose full text is available via Project Gutenberg.", "evaluation_metadata": {}}, "hans": {"name": "hans", "description": "The HANS dataset is an NLI evaluation set that tests specific hypotheses about invalid heuristics that NLI models are likely to learn.", "evaluation_metadata": {}}, "hard": {"name": "hard", "description": "This dataset contains 93700 hotel reviews in Arabic language.The hotel reviews were collected from Booking.com website during June/July 2016.The reviews are expressed in Modern Standard Arabic as well as dialectal Arabic.The following table summarize some tatistics on the HARD Dataset.", "evaluation_metadata": {}}, "harem": {"name": "harem", "description": "The HAREM is a Portuguese language corpus commonly used for Named Entity Recognition tasks. It includes about 93k words, from 129 different texts,\nfrom several genres, and language varieties. The split of this dataset version follows the division made by [1], where 7% HAREM\ndocuments are the validation set and the miniHAREM corpus (with about 65k words) is the test set. There are two versions of the dataset set,\na version that has a total of 10 different named entity classes (Person, Organization, Location, Value, Date, Title, Thing, Event,\nAbstraction, and Other) and a \"selective\" version with only 5 classes (Person, Organization, Location, Value, and Date).\n\nIt's important to note that the original version of the HAREM dataset has 2 levels of NER details, namely \"Category\" and \"Sub-type\".\nThe dataset version processed here ONLY USE the \"Category\" level of the original dataset.\n\n[1] Souza, F\u00e1bio, Rodrigo Nogueira, and Roberto Lotufo. \"BERTimbau: Pretrained BERT Models for Brazilian Portuguese.\" Brazilian Conference on Intelligent Systems. Springer, Cham, 2020.", "evaluation_metadata": {}}, "has_part": {"name": "has_part", "description": "This dataset is a new knowledge-base (KB) of hasPart relationships, extracted from a large corpus of generic statements. Complementary to other resources available, it is the first which is all three of: accurate (90% precision), salient (covers relationships a person may mention), and has high coverage of common terms (approximated as within a 10 year old\u2019s vocabulary), as well as having several times more hasPart entries than in the popular ontologies ConceptNet and WordNet. In addition, it contains information about quantifiers, argument modifiers, and links the entities to appropriate concepts in Wikipedia and WordNet.", "evaluation_metadata": {}}, "hate_speech18": {"name": "hate_speech18", "description": "These files contain text extracted from Stormfront, a white supremacist forum. A random set of\nforums posts have been sampled from several subforums and split into sentences. Those sentences\nhave been manually labelled as containing hate speech or not, according to certain annotation guidelines.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "hate_speech_filipino": {"name": "hate_speech_filipino", "description": " Contains 10k tweets (training set) that are labeled as hate speech or non-hate speech. Released with 4,232 validation and 4,232 testing samples. Collected during the 2016 Philippine Presidential Elections.", "evaluation_metadata": {}}, "hate_speech_offensive": {"name": "hate_speech_offensive", "description": "An annotated dataset for hate speech and offensive language detection on tweets.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train"}, "col_mapping": {"tweet": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "hate_speech_pl": {"name": "hate_speech_pl", "description": "HateSpeech corpus in the current version contains over 2000 posts crawled from public Polish web. They represent various types and degrees of offensive language, expressed toward minorities (eg. ethnical, racial). The data were annotated manually.", "evaluation_metadata": {}}, "hate_speech_portuguese": {"name": "hate_speech_portuguese", "description": "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate').", "evaluation_metadata": {}}, "hatexplain": {"name": "hatexplain", "description": "Hatexplain is the first benchmark hate speech dataset covering multiple aspects of the issue. Each post in the dataset is annotated from three different perspectives: the basic, commonly used 3-class classification (i.e., hate, offensive or normal), the target community (i.e., the community that has been the victim of hate speech/offensive speech in the post), and the rationales, i.e., the portions of the post on which their labelling decision (as hate, offensive or normal) is based.", "evaluation_metadata": {}}, "hausa_voa_ner": {"name": "hausa_voa_ner", "description": "The Hausa VOA NER dataset is a labeled dataset for named entity recognition in Hausa. The texts were obtained from\nHausa Voice of America News articles https://www.voahausa.com/ . We concentrate on\nfour types of named entities: persons [PER], locations [LOC], organizations [ORG], and dates & time [DATE].\n\nThe Hausa VOA NER data files contain 2 columns separated by a tab ('\\t'). Each word has been put on a separate line and\nthere is an empty line after each sentences i.e the CoNLL format. The first item on each line is a word, the second\nis the named entity tag. The named entity tags have the format I-TYPE which means that the word is inside a phrase\nof type TYPE. For every multi-word expression like 'New York', the first word gets a tag B-TYPE and the subsequent words\nhave tags I-TYPE, a word with tag O is not part of a phrase. The dataset is in the BIO tagging scheme.\n\nFor more details, see https://www.aclweb.org/anthology/2020.emnlp-main.204/", "evaluation_metadata": {}}, "hausa_voa_topics": {"name": "hausa_voa_topics", "description": "A collection of news article headlines in Hausa from VOA Hausa.\nEach headline is labeled with one of the following classes: Nigeria,\nAfrica, World, Health or Politics.\n\nThe dataset was presented in the paper:\nHedderich, Adelani, Zhu, Alabi, Markus, Klakow: Transfer Learning and\nDistant Supervision for Multilingual Transformer Models: A Study on\nAfrican Languages (EMNLP 2020).", "evaluation_metadata": {}}, "hda_nli_hindi": {"name": "hda_nli_hindi", "description": "This dataset is a recasted version of the Hindi Discourse Analysis Dataset used to train models for Natural Language Inference Tasks in Low-Resource Languages like Hindi.", "evaluation_metadata": {}}, "head_qa": {"name": "head_qa", "description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\n\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.", "evaluation_metadata": {}}, "health_fact": {"name": "health_fact", "description": "PUBHEALTH is a comprehensive dataset for explainable automated fact-checking of\npublic health claims. Each instance in the PUBHEALTH dataset has an associated\nveracity label (true, false, unproven, mixture). Furthermore each instance in the\ndataset has an explanation text field. The explanation is a justification for which\nthe claim has been assigned a particular veracity label.\n\nThe dataset was created to explore fact-checking of difficult to verify claims i.e.,\nthose which require expertise from outside of the journalistics domain, in this case\nbiomedical and public health expertise.\n\nIt was also created in response to the lack of fact-checking datasets which provide\ngold standard natural language explanations for verdicts/labels.\n\nNOTE: There are missing labels in the dataset and we have replaced them with -1.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"claim": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "hebrew_projectbenyehuda": {"name": "hebrew_projectbenyehuda", "description": "This repository contains a dump of thousands of public domain works in Hebrew, from Project Ben-Yehuda, in plaintext UTF-8 files, with and without diacritics (nikkud). The metadata (pseudocatalogue.csv) file is a list of titles, authors, genres, and file paths, to help you process the dump.\nAll these works are in the public domain, so you are free to make any use of them, and do not need to ask for permission.\nThere are 10078 files, 3181136 lines", "evaluation_metadata": {}}, "hebrew_sentiment": {"name": "hebrew_sentiment", "description": "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel\u2019s\npresident, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder,\n2013) to scrape all the comments to all of the president\u2019s posts in the period of June \u2013 August 2014,\nthe first three months of Rivlin\u2019s presidency.2 While the president\u2019s posts aimed at reconciling tensions\nand called for tolerance and empathy, the sentiment expressed in the comments to the president\u2019s posts\nwas polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his\npolicy. Of the 12,804 comments, 370 are neutral; 8,512 are positive, 3,922 negative.\n\nData Annotation: A trained researcher examined each comment and determined its sentiment value,\nwhere comments with an overall positive sentiment were assigned the value 1, comments with an overall\nnegative sentiment were assigned the value -1, and comments that are off-topic to the post\u2019s content\nwere assigned the value 0. We validated the coding scheme by asking a second trained researcher to\ncode the same data. There was substantial agreement between raters (N of agreements: 10623, N of\ndisagreements: 2105, Coehn\u2019s Kappa = 0.697, p = 0).", "evaluation_metadata": {}}, "hellaswag": {"name": "hellaswag", "description": "HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.", "evaluation_metadata": {}}, "cais/mmlu": {"name": "cais/mmlu", "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.", "evaluation_metadata": {}}, "hind_encorp": {"name": "hind_encorp", "description": "HindEnCorp parallel texts (sentence-aligned) come from the following sources:\nTides, which contains 50K sentence pairs taken mainly from news articles. This dataset was originally col- lected for the DARPA-TIDES surprise-language con- test in 2002, later refined at IIIT Hyderabad and provided for the NLP Tools Contest at ICON 2008 (Venkatapathy, 2008).\n\nCommentaries by Daniel Pipes contain 322 articles in English written by a journalist Daniel Pipes and translated into Hindi.\n\nEMILLE. This corpus (Baker et al., 2002) consists of three components: monolingual, parallel and annotated corpora. There are fourteen monolingual sub- corpora, including both written and (for some lan- guages) spoken data for fourteen South Asian lan- guages. The EMILLE monolingual corpora contain in total 92,799,000 words (including 2,627,000 words of transcribed spoken data for Bengali, Gujarati, Hindi, Punjabi and Urdu). The parallel corpus consists of 200,000 words of text in English and its accompanying translations into Hindi and other languages.\n\nSmaller datasets as collected by Bojar et al. (2010) include the corpus used at ACL 2005 (a subcorpus of EMILLE), a corpus of named entities from Wikipedia (crawled in 2009), and Agriculture domain parallel corpus.\n\ufffc\nFor the current release, we are extending the parallel corpus using these sources:\nIntercorp (\u010cerm\u00e1k and Rosen,2012) is a large multilingual parallel corpus of 32 languages including Hindi. The central language used for alignment is Czech. Intercorp\u2019s core texts amount to 202 million words. These core texts are most suitable for us because their sentence alignment is manually checked and therefore very reliable. They cover predominately short sto- ries and novels. There are seven Hindi texts in Inter- corp. Unfortunately, only for three of them the English translation is available; the other four are aligned only with Czech texts. The Hindi subcorpus of Intercorp contains 118,000 words in Hindi.\n\nTED talks 3 held in various languages, primarily English, are equipped with transcripts and these are translated into 102 languages. There are 179 talks for which Hindi translation is available.\n\nThe Indic multi-parallel corpus (Birch et al., 2011; Post et al., 2012) is a corpus of texts from Wikipedia translated from the respective Indian language into English by non-expert translators hired over Mechanical Turk. The quality is thus somewhat mixed in many respects starting from typesetting and punctuation over capi- talization, spelling, word choice to sentence structure. A little bit of control could be in principle obtained from the fact that every input sentence was translated 4 times. We used the 2012 release of the corpus.\n\nLaunchpad.net is a software collaboration platform that hosts many open-source projects and facilitates also collaborative localization of the tools. We downloaded all revisions of all the hosted projects and extracted the localization (.po) files.\n\nOther smaller datasets. This time, we added Wikipedia entities as crawled in 2013 (including any morphological variants of the named entitity that appears on the Hindi variant of the Wikipedia page) and words, word examples and quotes from the Shabdkosh online dictionary.", "evaluation_metadata": {}}, "hindi_discourse": {"name": "hindi_discourse", "description": "The Hindi Discourse Analysis dataset is a corpus for analyzing discourse modes present in its sentences.\nIt contains sentences from stories written by 11 famous authors from the 20th Century.\n4-5 stories by each author have been selected which were available in the public domain resulting\nin a collection of 53 stories. Most of these short stories were originally written in Hindi\nbut some of them were written in other Indian languages and later translated to Hindi.", "evaluation_metadata": {}}, "hkcancor": {"name": "hkcancor", "description": "The Hong Kong Cantonese Corpus (HKCanCor) comprise transcribed conversations\nrecorded between March 1997 and August 1998. It contains recordings of\nspontaneous speech (51 texts) and radio programmes (42 texts),\nwhich involve 2 to 4 speakers, with 1 text of monologue.\n\nIn total, the corpus contains around 230,000 Chinese words.\nThe text is word-segmented, annotated with part-of-speech (POS) tags and\nromanised Cantonese pronunciation.\n\nRomanisation scheme - Linguistic Society of Hong Kong (LSHK)\nPOS scheme - Peita-Fujitsu-Renmin Ribao (PRF) corpus (Duan et al., 2000),\n with extended tags for Cantonese-specific phenomena added by\n Luke and Wang (see original paper for details).", "evaluation_metadata": {}}, "hlgd": {"name": "hlgd", "description": "HLGD is a binary classification dataset consisting of 20,056 labeled news headlines pairs indicating\nwhether the two headlines describe the same underlying world event or not.", "evaluation_metadata": {}}, "hope_edi": {"name": "hope_edi", "description": "A Hope Speech dataset for Equality, Diversity and Inclusion (HopeEDI) containing user-generated comments from the social media platform YouTube with 28,451, 20,198 and 10,705 comments in English, Tamil and Malayalam, respectively, manually labelled as containing hope speech or not.", "evaluation_metadata": {}}, "hotpot_qa": {"name": "hotpot_qa", "description": "HotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features:\n(1) the questions require finding and reasoning over multiple supporting documents to answer;\n(2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas;\n(3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervisionand explain the predictions;\n(4) we offer a new type of factoid comparison questions to testQA systems\u2019 ability to extract relevant facts and perform necessary comparison.", "evaluation_metadata": {}}, "hover": {"name": "hover", "description": "HoVer is an open-domain, many-hop fact extraction and claim verification dataset built upon the Wikipedia corpus. The original 2-hop claims are adapted from question-answer pairs from HotpotQA. It is collected by a team of NLP researchers at UNC Chapel Hill and Verisk Analytics.", "evaluation_metadata": {}}, "hrenwac_para": {"name": "hrenwac_para", "description": "The hrenWaC corpus version 2.0 consists of parallel Croatian-English texts crawled from the .hr top-level domain for Croatia.\nThe corpus was built with Spidextor (https://github.com/abumatran/spidextor), a tool that glues together the output of SpiderLing used for crawling and Bitextor used for bitext extraction. The accuracy of the extracted bitext on the segment level is around 80% and on the word level around 84%.", "evaluation_metadata": {}}, "hrwac": {"name": "hrwac", "description": "The Croatian web corpus hrWaC was built by crawling the .hr top-level domain in 2011 and again in 2014. The corpus was near-deduplicated on paragraph level, normalised via diacritic restoration, morphosyntactically annotated and lemmatised. The corpus is shuffled by paragraphs. Each paragraph contains metadata on the URL, domain and language identification (Croatian vs. Serbian).\n\nVersion 2.0 of this corpus is described in http://www.aclweb.org/anthology/W14-0405. Version 2.1 contains newer and better linguistic annotations.", "evaluation_metadata": {}}, "humicroedit": {"name": "humicroedit", "description": "This new dataset is designed to assess the funniness of edited news headlines.", "evaluation_metadata": {}}, "hybrid_qa": {"name": "hybrid_qa", "description": "Existing question answering datasets focus on dealing with homogeneous information, based either only on text or KB/Table information alone. However, as human knowledge is distributed over heterogeneous forms, using homogeneous information alone might lead to severe coverage problems. To fill in the gap, we present HybridQA, a new large-scale question-answering dataset that requires reasoning on heterogeneous information. Each question is aligned with a Wikipedia table and multiple free-form corpora linked with the entities in the table. The questions are designed to aggregate both tabular information and text information, i.e., lack of either form would render the question unanswerable.", "evaluation_metadata": {}}, "hyperpartisan_news_detection": {"name": "hyperpartisan_news_detection", "description": "Hyperpartisan News Detection was a dataset created for PAN @ SemEval 2019 Task 4.\nGiven a news article text, decide whether it follows a hyperpartisan argumentation, i.e., whether it exhibits blind, prejudiced, or unreasoning allegiance to one party, faction, cause, or person.\n\nThere are 2 parts:\n- byarticle: Labeled through crowdsourcing on an article basis. The data contains only articles for which a consensus among the crowdsourcing workers existed.\n- bypublisher: Labeled by the overall bias of the publisher as provided by BuzzFeed journalists or MediaBiasFactCheck.com.", "evaluation_metadata": {}}, "iapp_wiki_qa_squad": {"name": "iapp_wiki_qa_squad", "description": "`iapp_wiki_qa_squad` is an extractive question answering dataset from Thai Wikipedia articles.\nIt is adapted from [the original iapp-wiki-qa-dataset](https://github.com/iapp-technology/iapp-wiki-qa-dataset)\nto [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) format, resulting in\n5761/742/739 questions from 1529/191/192 articles.", "evaluation_metadata": {}}, "id_clickbait": {"name": "id_clickbait", "description": "The CLICK-ID dataset is a collection of Indonesian news headlines that was collected from 12 local online news\npublishers; detikNews, Fimela, Kapanlagi, Kompas, Liputan6, Okezone, Posmetro-Medan, Republika, Sindonews, Tempo,\nTribunnews, and Wowkeren. This dataset is comprised of mainly two parts; (i) 46,119 raw article data, and (ii)\n15,000 clickbait annotated sample headlines. Annotation was conducted with 3 annotator examining each headline.\nJudgment were based only on the headline. The majority then is considered as the ground truth. In the annotated\nsample, our annotation shows 6,290 clickbait and 8,710 non-clickbait.", "evaluation_metadata": {}}, "id_nergrit_corpus": {"name": "id_nergrit_corpus", "description": "Nergrit Corpus is a dataset collection for Indonesian Named Entity Recognition, Statement Extraction, and Sentiment\nAnalysis. id_nergrit_corpus is the Named Entity Recognition of this dataset collection which contains 18 entities as\nfollow:\n 'CRD': Cardinal\n 'DAT': Date\n 'EVT': Event\n 'FAC': Facility\n 'GPE': Geopolitical Entity\n 'LAW': Law Entity (such as Undang-Undang)\n 'LOC': Location\n 'MON': Money\n 'NOR': Political Organization\n 'ORD': Ordinal\n 'ORG': Organization\n 'PER': Person\n 'PRC': Percent\n 'PRD': Product\n 'QTY': Quantity\n 'REG': Religion\n 'TIM': Time\n 'WOA': Work of Art\n 'LAN': Language", "evaluation_metadata": {}}, "id_newspapers_2018": {"name": "id_newspapers_2018", "description": "The dataset contains around 500K articles (136M of words) from 7 Indonesian newspapers: Detik, Kompas, Tempo,\nCNN Indonesia, Sindo, Republika and Poskota. The articles are dated between 1st January 2018 and 20th August 2018\n(with few exceptions dated earlier). The size of uncompressed 500K json files (newspapers-json.tgz) is around 2.2GB,\nand the cleaned uncompressed in a big text file (newspapers.txt.gz) is about 1GB. The original source in Google Drive\ncontains also a dataset in html format which include raw data (pictures, css, javascript, ...)\nfrom the online news website", "evaluation_metadata": {}}, "id_panl_bppt": {"name": "id_panl_bppt", "description": "Parallel Text Corpora for Multi-Domain Translation System created by BPPT (Indonesian Agency for the Assessment and\nApplication of Technology) for PAN Localization Project (A Regional Initiative to Develop Local Language Computing\nCapacity in Asia). The dataset contains around 24K sentences divided in 4 difference topics (Economic, international,\nScience and Technology and Sport).", "evaluation_metadata": {}}, "id_puisi": {"name": "id_puisi", "description": "Puisi (poem) is an Indonesian poetic form. The dataset contains 7223 Indonesian puisi with its title and author.", "evaluation_metadata": {}}, "igbo_monolingual": {"name": "igbo_monolingual", "description": "A dataset is a collection of Monolingual Igbo sentences.", "evaluation_metadata": {}}, "igbo_ner": {"name": "igbo_ner", "description": "Igbo Named Entity Recognition Dataset", "evaluation_metadata": {}}, "ilist": {"name": "ilist", "description": "This dataset is introduced in a task which aimed at identifying 5 closely-related languages of Indo-Aryan language family \u2013\nHindi (also known as Khari Boli), Braj Bhasha, Awadhi, Bhojpuri, and Magahi.", "evaluation_metadata": {}}, "imdb": {"name": "imdb", "description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.\\", "evaluation_metadata": [{"config": "plain_text", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy"}, {"name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "imdb_urdu_reviews": {"name": "imdb_urdu_reviews", "description": "Large Movie translated Urdu Reviews Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous\nbenchmark datasets. We provide a set of 40,000 highly polar movie reviews for training, and 10,000 for testing.\nTo increase the availability of sentiment analysis dataset for a low recourse language like Urdu,\nwe opted to use the already available IMDB Dataset. we have translated this dataset using google translator.\nThis is a binary classification dataset having two classes as positive and negative.\nThe reason behind using this dataset is high polarity for each class.\nIt contains 50k samples equally divided in two classes.", "evaluation_metadata": {}}, "indic_glue": {"name": "indic_glue", "description": " IndicGLUE is a natural language understanding benchmark for Indian languages. It contains a wide\n variety of tasks and covers 11 major Indian languages - as, bn, gu, hi, kn, ml, mr, or, pa, ta, te.", "evaluation_metadata": {}}, "indonli": {"name": "indonli", "description": " IndoNLI is the first human-elicited Natural Language Inference (NLI) dataset for Indonesian.\n IndoNLI is annotated by both crowd workers and experts. The expert-annotated data is used exclusively as a test set.\n It is designed to provide a challenging test-bed for Indonesian NLI by explicitly incorporating various linguistic phenomena such as numerical reasoning, structural changes, idioms, or temporal and spatial reasoning.", "evaluation_metadata": {}}, "indonlp/indonlu": {"name": "indonlp/indonlu", "description": "The IndoNLU benchmark is a collection of resources for training, evaluating, and analyzing natural language understanding systems for Bahasa Indonesia.", "evaluation_metadata": {}}, "interpress_news_category_tr": {"name": "interpress_news_category_tr", "description": "It is a Turkish news data set consisting of 273601 news in 17 categories, compiled from print media and news websites between 2010 and 2017 by the Interpress (https://www.interpress.com/) media monitoring company.", "evaluation_metadata": {}}, "interpress_news_category_tr_lite": {"name": "interpress_news_category_tr_lite", "description": "It is a Turkish news data set consisting of 273601 news in 10 categories, compiled from print media and news websites between 2010 and 2017 by the Interpress (https://www.interpress.com/) media monitoring company. It has been rearranged as easily separable and with fewer classes.", "evaluation_metadata": {}}, "irc_disentangle": {"name": "irc_disentangle", "description": "Disentangling conversations mixed together in a single stream of messages is\na difficult task, made harder by the lack of large manually annotated\ndatasets. This new dataset of 77,563 messages manually annotated with\nreply-structure graphs that both disentangle conversations and define\ninternal conversation structure. The dataset is 16 times larger than all\npreviously released datasets combined, the first to include adjudication of\nannotation disagreements, and the first to include context.", "evaluation_metadata": {}}, "isixhosa_ner_corpus": {"name": "isixhosa_ner_corpus", "description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.", "evaluation_metadata": {}}, "isizulu_ner_corpus": {"name": "isizulu_ner_corpus", "description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.", "evaluation_metadata": {}}, "iwslt2017": {"name": "iwslt2017", "description": "The IWSLT 2017 Multilingual Task addresses text translation, including zero-shot translation, with a single MT system across all directions including English, German, Dutch, Italian and Romanian. As unofficial task, conventional bilingual text translation is offered between English and Arabic, French, Japanese, Chinese, German and Korean.", "evaluation_metadata": {}}, "jeopardy": {"name": "jeopardy", "description": "Dataset containing 216,930 Jeopardy questions, answers and other data.\n\nThe json file is an unordered list of questions where each question has\n'category' : the question category, e.g. \"HISTORY\"\n'value' : integer $ value of the question as string, e.g. \"200\"\nNote: This is \"None\" for Final Jeopardy! and Tiebreaker questions\n'question' : text of question\nNote: This sometimes contains hyperlinks and other things messy text such as when there's a picture or video question\n'answer' : text of answer\n'round' : one of \"Jeopardy!\",\"Double Jeopardy!\",\"Final Jeopardy!\" or \"Tiebreaker\"\nNote: Tiebreaker questions do happen but they're very rare (like once every 20 years)\n'show_number' : int of show number, e.g '4680'\n'air_date' : string of the show air date in format YYYY-MM-DD", "evaluation_metadata": {}}, "jnlpba": {"name": "jnlpba", "description": "The data came from the GENIA version 3.02 corpus (Kim et al., 2003). This was formed from a controlled search\non MEDLINE using the MeSH terms \u0018human\u0019, \u0018blood cells\u0019 and \u0018transcription factors\u0019. From this search 2,000 abstracts\nwere selected and hand annotated according to a small taxonomy of 48 classes based on a chemical classification.\nAmong the classes, 36 terminal classes were used to annotate the GENIA corpus.", "evaluation_metadata": {}}, "journalists_questions": {"name": "journalists_questions", "description": "\\\r\nThe journalists_questions corpus (version 1.0) is a collection of 10K human-written Arabic\r\ntweets manually labeled for question identification over Arabic tweets posted by journalists.", "evaluation_metadata": {}}, "kan_hope": {"name": "kan_hope", "description": "Numerous methods have been developed to monitor the spread of negativity in modern years by\neliminating vulgar, offensive, and fierce comments from social media platforms. However, there are relatively\nlesser amounts of study that converges on embracing positivity, reinforcing supportive and reassuring content in online forums.\nConsequently, we propose creating an English Kannada Hope speech dataset, KanHope and comparing several experiments to benchmark the dataset.\nThe dataset consists of 6,176 user generated comments in code mixed Kannada scraped from YouTube and manually annotated as bearing hope\nspeech or Not-hope speech.\nThis dataset was prepared for hope-speech text classification benchmark on code-mixed Kannada, an under-resourced language.", "evaluation_metadata": {}}, "kd_conv": {"name": "kd_conv", "description": "KdConv is a Chinese multi-domain Knowledge-driven Conversionsation dataset, grounding the topics in multi-turn conversations to knowledge graphs. KdConv contains 4.5K conversations from three domains (film, music, and travel), and 86K utterances with an average turn number of 19.0. These conversations contain in-depth discussions on related topics and natural transition between multiple topics, while the corpus can also used for exploration of transfer learning and domain adaptation.\\", "evaluation_metadata": {}}, "kde4": {"name": "kde4", "description": "A parallel corpus of KDE4 localization files (v.2).\n\n92 languages, 4,099 bitexts\ntotal number of files: 75,535\ntotal number of tokens: 60.75M\ntotal number of sentence fragments: 8.89M", "evaluation_metadata": {}}, "kelm": {"name": "kelm", "description": "Data-To-Text Generation involves converting knowledge graph (KG) triples of the form (subject, relation, object) into\na natural language sentence(s). This dataset consists of English KG data converted into paired natural language text.\nThe generated corpus consists of \u223c18M sentences spanning \u223c45M triples with \u223c1500 distinct relations.", "evaluation_metadata": {}}, "kilt_tasks": {"name": "kilt_tasks", "description": "KILT tasks training and evaluation data.\n- [FEVER](https://fever.ai) | Fact Checking | fever\n- [AIDA CoNLL-YAGO](https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida/downloads) | Entity Linking | aidayago2\n- [WNED-WIKI](https://github.com/U-Alberta/wned) | Entity Linking | wned\n- [WNED-CWEB](https://github.com/U-Alberta/wned) | Entity Linking | cweb\n- [T-REx](https://hadyelsahar.github.io/t-rex) | Slot Filling | trex\n- [Zero-Shot RE](http://nlp.cs.washington.edu/zeroshot) | Slot Filling | structured_zeroshot\n- [Natural Questions](https://ai.google.com/research/NaturalQuestions) | Open Domain QA | nq\n- [HotpotQA](https://hotpotqa.github.io) | Open Domain QA | hotpotqa\n- [TriviaQA](http://nlp.cs.washington.edu/triviaqa) | Open Domain QA | triviaqa\n- [ELI5](https://facebookresearch.github.io/ELI5/explore.html) | Open Domain QA | eli5\n- [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia) | Dialogue | wow\n\nTo finish linking TriviaQA questions to the IDs provided, follow the instructions [here](http://github.com/huggingface/datasets/datasets/kilt_tasks/README.md).", "evaluation_metadata": {}}, "kilt_wikipedia": {"name": "kilt_wikipedia", "description": "KILT-Wikipedia: Wikipedia pre-processed for KILT.", "evaluation_metadata": {}}, "kinnews_kirnews": {"name": "kinnews_kirnews", "description": "Kinyarwanda and Kirundi news classification datasets", "evaluation_metadata": {}}, "klue": {"name": "klue", "description": "KLUE (Korean Language Understanding Evaluation)\nKorean Language Understanding Evaluation (KLUE) benchmark is a series of datasets to evaluate natural language\nunderstanding capability of Korean language models. KLUE consists of 8 diverse and representative tasks, which are accessible\nto anyone without any restrictions. With ethical considerations in mind, we deliberately design annotation guidelines to obtain\nunambiguous annotations for all datasets. Futhermore, we build an evaluation system and carefully choose evaluations metrics\nfor every task, thus establishing fair comparison across Korean language models.", "evaluation_metadata": {}}, "kor_3i4k": {"name": "kor_3i4k", "description": "This dataset is designed to identify speaker intention based on real-life spoken utterance in Korean into one of\n7 categories: fragment, description, question, command, rhetorical question, rhetorical command, utterances.", "evaluation_metadata": {}}, "kor_hate": {"name": "kor_hate", "description": "Human-annotated Korean corpus collected from a popular domestic entertainment news aggregation platform\nfor toxic speech detection. Comments are annotated for gender bias, social bias and hate speech.", "evaluation_metadata": {}}, "kor_nli": {"name": "kor_nli", "description": "Korean Natural Language Inference datasets", "evaluation_metadata": {}}, "kor_nlu": {"name": "kor_nlu", "description": " The dataset contains data for bechmarking korean models on NLI and STS", "evaluation_metadata": {}}, "kor_qpair": {"name": "kor_qpair", "description": "This is a Korean paired question dataset containing labels indicating whether two questions in a given pair are semantically identical. This dataset was used to evaluate the performance of [KoGPT2](https://github.com/SKT-AI/KoGPT2#subtask-evaluations) on a phrase detection downstream task.", "evaluation_metadata": {}}, "kor_sae": {"name": "kor_sae", "description": "This new dataset is designed to extract intent from non-canonical directives which will help dialog managers\nextract intent from user dialog that may have no clear objective or are paraphrased forms of utterances.", "evaluation_metadata": {}}, "kor_sarcasm": {"name": "kor_sarcasm", "description": "This is a dataset designed to detect sarcasm in Korean because it distorts the literal meaning of a sentence\nand is highly related to sentiment classification.", "evaluation_metadata": {}}, "labr": {"name": "labr", "description": "This dataset contains over 63,000 book reviews in Arabic.It is the largest sentiment analysis dataset for Arabic to-date.The book reviews were harvested from the website Goodreads during the month or March 2013.Each book review comes with the goodreads review id, the user id, the book id, the rating (1 to 5) and the text of the review.", "evaluation_metadata": {}}, "lama": {"name": "lama", "description": "LAMA is a dataset used to probe and analyze the factual and commonsense knowledge contained in pretrained language models. See https://github.com/facebookresearch/LAMA.", "evaluation_metadata": {}}, "lambada": {"name": "lambada", "description": "The LAMBADA evaluates the capabilities of computational models\nfor text understanding by means of a word prediction task.\nLAMBADA is a collection of narrative passages sharing the characteristic\nthat human subjects are able to guess their last word if\nthey are exposed to the whole passage, but not if they\nonly see the last sentence preceding the target word.\nTo succeed on LAMBADA, computational models cannot\nsimply rely on local context, but must be able to\nkeep track of information in the broader discourse.\n\nThe LAMBADA dataset is extracted from BookCorpus and\nconsists of 10'022 passages, divided into 4'869 development\nand 5'153 test passages. The training data for language\nmodels to be tested on LAMBADA include the full text\nof 2'662 novels (disjoint from those in dev+test),\ncomprising 203 million words.", "evaluation_metadata": {}}, "large_spanish_corpus": {"name": "large_spanish_corpus", "description": "The Large Spanish Corpus is a compilation of 15 unlabelled Spanish corpora spanning Wikipedia to European parliament notes. Each config contains the data corresponding to a different corpus. For example, \"all_wiki\" only includes examples from Spanish Wikipedia. By default, the config is set to \"combined\" which loads all the corpora; with this setting you can also specify the number of samples to return per corpus by configuring the \"split\" argument.", "evaluation_metadata": {}}, "laroseda": {"name": "laroseda", "description": " LaRoSeDa (A Large Romanian Sentiment Data Set) contains 15,000 reviews written in Romanian, of which 7,500 are positive and 7,500 negative.\n Star ratings of 1 and 2 and of 4 and 5 are provided for negative and positive reviews respectively.\n The current dataset uses star rating as the label for multi-class classification.", "evaluation_metadata": {}}, "lc_quad": {"name": "lc_quad", "description": "LC-QuAD 2.0 is a Large Question Answering dataset with 30,000 pairs of question and its corresponding SPARQL query. The target knowledge base is Wikidata and DBpedia, specifically the 2018 version. Please see our paper for details about the dataset creation process and framework.", "evaluation_metadata": {}}, "lener_br": {"name": "lener_br", "description": "LeNER-Br is a Portuguese language dataset for named entity recognition\napplied to legal documents. LeNER-Br consists entirely of manually annotated\nlegislation and legal cases texts and contains tags for persons, locations,\ntime entities, organizations, legislation and legal cases.\nTo compose the dataset, 66 legal documents from several Brazilian Courts were\ncollected. Courts of superior and state levels were considered, such as Supremo\nTribunal Federal, Superior Tribunal de Justi\u00e7a, Tribunal de Justi\u00e7a de Minas\nGerais and Tribunal de Contas da Uni\u00e3o. In addition, four legislation documents\nwere collected, such as \"Lei Maria da Penha\", giving a total of 70 documents", "evaluation_metadata": {}}, "lex_glue": {"name": "lex_glue", "description": "Legal General Language Understanding Evaluation (LexGLUE) benchmark is\na collection of datasets for evaluating model performance across a diverse set of legal NLU tasks", "evaluation_metadata": {}}, "liar": {"name": "liar", "description": "LIAR is a dataset for fake news detection with 12.8K human labeled short statements from politifact.com's API, and each statement is evaluated by a politifact.com editor for its truthfulness. The distribution of labels in the LIAR dataset is relatively well-balanced: except for 1,050 pants-fire cases, the instances for all other labels range from 2,063 to 2,638. In each case, the labeler provides a lengthy analysis report to ground each judgment.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"statement": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "librispeech_asr": {"name": "librispeech_asr", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "librispeech_lm": {"name": "librispeech_lm", "description": "Language modeling resources to be used in conjunction with the LibriSpeech ASR corpus.", "evaluation_metadata": {}}, "limit": {"name": "limit", "description": "Motion recognition is one of the basic cognitive capabilities of many life forms, yet identifying motion of physical entities in natural language have not been explored extensively and empirically. Literal-Motion-in-Text (LiMiT) dataset, is a large human-annotated collection of English text sentences describing physical occurrence of motion, with annotated physical entities in motion.", "evaluation_metadata": {}}, "lince": {"name": "lince", "description": "LinCE is a centralized Linguistic Code-switching Evaluation benchmark\n(https://ritual.uh.edu/lince/) that contains data for training and evaluating\nNLP systems on code-switching tasks.", "evaluation_metadata": {}}, "linnaeus": {"name": "linnaeus", "description": "A novel corpus of full-text documents manually annotated for species mentions.", "evaluation_metadata": {}}, "liveqa": {"name": "liveqa", "description": "This is LiveQA, a Chinese dataset constructed from play-by-play live broadcast.\nIt contains 117k multiple-choice questions written by human commentators for over 1,670 NBA games,\nwhich are collected from the Chinese Hupu website.", "evaluation_metadata": {}}, "lj_speech": {"name": "lj_speech", "description": "This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading\npassages from 7 non-fiction books in English. A transcription is provided for each clip. Clips vary in length\nfrom 1 to 10 seconds and have a total length of approximately 24 hours.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .wav format and is not converted to a float32 array. To convert the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```", "evaluation_metadata": {}}, "lm1b": {"name": "lm1b", "description": "A benchmark corpus to be used for measuring progress in statistical language modeling. This has almost one billion words in the training data.", "evaluation_metadata": {}}, "m_lama": {"name": "m_lama", "description": "mLAMA: a multilingual version of the LAMA benchmark (T-REx and GoogleRE) covering 53 languages.", "evaluation_metadata": {}}, "mac_morpho": {"name": "mac_morpho", "description": "Mac-Morpho is a corpus of Brazilian Portuguese texts annotated with part-of-speech tags.\nIts first version was released in 2003 [1], and since then, two revisions have been made in order\nto improve the quality of the resource [2, 3].\nThe corpus is available for download split into train, development and test sections.\nThese are 76%, 4% and 20% of the corpus total, respectively (the reason for the unusual numbers\nis that the corpus was first split into 80%/20% train/test, and then 5% of the train section was\nset aside for development). This split was used in [3], and new POS tagging research with Mac-Morpho\nis encouraged to follow it in order to make consistent comparisons possible.\n\n\n[1] Alu\u00edsio, S., Pelizzoni, J., Marchi, A.R., de Oliveira, L., Manenti, R., Marquiaf\u00e1vel, V. 2003.\nAn account of the challenge of tagging a reference corpus for brazilian portuguese.\nIn: Proceedings of the 6th International Conference on Computational Processing of the Portuguese Language. PROPOR 2003\n\n[2] Fonseca, E.R., Rosa, J.L.G. 2013. Mac-morpho revisited: Towards robust part-of-speech.\nIn: Proceedings of the 9th Brazilian Symposium in Information and Human Language Technology \u2013 STIL\n\n[3] Fonseca, E.R., Alu\u00edsio, Sandra Maria, Rosa, J.L.G. 2015.\nEvaluating word embeddings and a revised corpus for part-of-speech tagging in Portuguese.\nJournal of the Brazilian Computer Society.", "evaluation_metadata": {}}, "makhzan": {"name": "makhzan", "description": "An Urdu text corpus for machine learning, natural language processing and linguistic analysis.", "evaluation_metadata": {}}, "masakhaner": {"name": "masakhaner", "description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811", "evaluation_metadata": {}}, "math_dataset": {"name": "math_dataset", "description": "Mathematics database.\n\nThis dataset code generates mathematical question and answer pairs,\nfrom a range of question types at roughly school-level difficulty.\nThis is designed to test the mathematical learning and algebraic\nreasoning skills of learning models.\n\nOriginal paper: Analysing Mathematical Reasoning Abilities of Neural Models\n(Saxton, Grefenstette, Hill, Kohli).\n\nExample usage:\ntrain_examples, val_examples = datasets.load_dataset(\n 'math_dataset/arithmetic__mul',\n split=['train', 'test'],\n as_supervised=True)", "evaluation_metadata": {}}, "math_qa": {"name": "math_qa", "description": "Our dataset is gathered by using a new representation language to annotate over the AQuA-RAT dataset. AQuA-RAT has provided the questions, options, rationale, and the correct options.", "evaluation_metadata": {}}, "mbpp": {"name": "mbpp", "description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been\nhand-verified by the authors.", "evaluation_metadata": {}}, "mc4": {"name": "mc4", "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.", "evaluation_metadata": {}}, "mc_taco": {"name": "mc_taco", "description": "MC-TACO (Multiple Choice TemporAl COmmonsense) is a dataset of 13k question-answer\npairs that require temporal commonsense comprehension. A system receives a sentence\nproviding context information, a question designed to require temporal commonsense\nknowledge, and multiple candidate answers. More than one candidate answer can be plausible.\n\nThe task is framed as binary classification: givent he context, the question,\nand the candidate answer, the task is to determine whether the candidate\nanswer is plausible (\"yes\") or not (\"no\").", "evaluation_metadata": {}}, "md_gender_bias": {"name": "md_gender_bias", "description": "Machine learning models are trained to find patterns in data.\nNLP models can inadvertently learn socially undesirable patterns when training on gender biased text.\nIn this work, we propose a general framework that decomposes gender bias in text along several pragmatic and semantic dimensions:\nbias from the gender of the person being spoken about, bias from the gender of the person being spoken to, and bias from the gender of the speaker.\nUsing this fine-grained framework, we automatically annotate eight large scale datasets with gender information.\nIn addition, we collect a novel, crowdsourced evaluation benchmark of utterance-level gender rewrites.\nDistinguishing between gender bias along multiple dimensions is important, as it enables us to train finer-grained gender bias classifiers.\nWe show our classifiers prove valuable for a variety of important applications, such as controlling for gender bias in generative models,\ndetecting gender bias in arbitrary text, and shed light on offensive language in terms of genderedness.", "evaluation_metadata": {}}, "mdd": {"name": "mdd", "description": "The Movie Dialog dataset (MDD) is designed to measure how well\nmodels can perform at goal and non-goal orientated dialog\ncentered around the topic of movies (question answering,\nrecommendation and discussion).", "evaluation_metadata": {}}, "med_hop": {"name": "med_hop", "description": "MedHop is based on research paper abstracts from PubMed, and the queries are about interactions between pairs of drugs. The correct answer has to be inferred by combining information from a chain of reactions of drugs and proteins.", "evaluation_metadata": {}}, "medal": {"name": "medal", "description": "A large medical text dataset (14Go) curated to 4Go for abbreviation disambiguation, designed for natural language understanding pre-training in the medical domain. For example, DHF can be disambiguated to dihydrofolate, diastolic heart failure, dengue hemorragic fever or dihydroxyfumarate", "evaluation_metadata": {}}, "medical_dialog": {"name": "medical_dialog", "description": "The MedDialog dataset (English) contains conversations (in English) between doctors and patients.It has 0.26 million dialogues. The data is continuously growing and more dialogues will be added. The raw dialogues are from healthcaremagic.com and icliniq.com.\nAll copyrights of the data belong to healthcaremagic.com and icliniq.com.", "evaluation_metadata": {}}, "medical_questions_pairs": {"name": "medical_questions_pairs", "description": "This dataset consists of 3048 similar and dissimilar medical question pairs hand-generated and labeled by Curai's doctors.", "evaluation_metadata": {}}, "menyo20k_mt": {"name": "menyo20k_mt", "description": "MENYO-20k is a multi-domain parallel dataset with texts obtained from news articles, ted talks, movie transcripts, radio transcripts, science and technology texts, and other short articles curated from the web and professional translators. The dataset has 20,100 parallel sentences split into 10,070 training sentences, 3,397 development sentences, and 6,633 test sentences (3,419 multi-domain, 1,714 news domain, and 1,500 ted talks speech transcript domain). The development and test sets are available upon request.", "evaluation_metadata": {}}, "meta_woz": {"name": "meta_woz", "description": "MetaLWOz: A Dataset of Multi-Domain Dialogues for the Fast Adaptation of Conversation Models. We introduce the Meta-Learning Wizard of Oz (MetaLWOz) dialogue dataset for developing fast adaptation methods for conversation models. This data can be used to train task-oriented dialogue models, specifically to develop methods to quickly simulate user responses with a small amount of data. Such fast-adaptation models fall into the research areas of transfer learning and meta learning. The dataset consists of 37,884 crowdsourced dialogues recorded between two human users in a Wizard of Oz setup, in which one was instructed to behave like a bot, and the other a true human user. The users are assigned a task belonging to a particular domain, for example booking a reservation at a particular restaurant, and work together to complete the task. Our dataset spans 47 domains having 227 tasks total. Dialogues are a minimum of 10 turns long.", "evaluation_metadata": {}}, "metooma": {"name": "metooma", "description": "The dataset consists of tweets belonging to #MeToo movement on Twitter, labelled into different categories.\nDue to Twitter's development policies, we only provide the tweet ID's and corresponding labels,\nother data can be fetched via Twitter API.\nThe data has been labelled by experts, with the majority taken into the account for deciding the final label.\nWe provide these labels for each of the tweets. The labels provided for each data point\nincludes -- Relevance, Directed Hate, Generalized Hate,\nSarcasm, Allegation, Justification, Refutation, Support, Oppose", "evaluation_metadata": {}}, "metrec": {"name": "metrec", "description": "Arabic Poetry Metric Classification.\nThe dataset contains the verses and their corresponding meter classes.Meter classes are represented as numbers from 0 to 13. The dataset can be highly useful for further research in order to improve the field of Arabic poems\u2019 meter classification.The train dataset contains 47,124 records and the test dataset contains 8316 records.", "evaluation_metadata": {}}, "miam": {"name": "miam", "description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.", "evaluation_metadata": {}}, "mkb": {"name": "mkb", "description": "The Prime Minister's speeches - Mann Ki Baat, on All India Radio, translated into many languages.", "evaluation_metadata": {}}, "mkqa": {"name": "mkqa", "description": "We introduce MKQA, an open-domain question answering evaluation set comprising 10k question-answer pairs sampled from the Google Natural Questions dataset, aligned across 26 typologically diverse languages (260k question-answer pairs in total). For each query we collected new passage-independent answers. These queries and answers were then human translated into 25 Non-English languages.", "evaluation_metadata": {}}, "mlqa": {"name": "mlqa", "description": " MLQA (MultiLingual Question Answering) is a benchmark dataset for evaluating cross-lingual question answering performance.\n MLQA consists of over 5K extractive QA instances (12K in English) in SQuAD format in seven languages - English, Arabic,\n German, Spanish, Hindi, Vietnamese and Simplified Chinese. MLQA is highly parallel, with QA instances parallel between\n 4 different languages on average.", "evaluation_metadata": {}}, "mlsum": {"name": "mlsum", "description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.", "evaluation_metadata": {}}, "mnist": {"name": "mnist", "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.", "evaluation_metadata": {}}, "mocha": {"name": "mocha", "description": "Posing reading comprehension as a generation problem provides a great deal of flexibility, allowing for open-ended questions with few restrictions on possible answers. However, progress is impeded by existing generation metrics, which rely on token overlap and are agnostic to the nuances of reading comprehension. To address this, we introduce a benchmark for training and evaluating generative reading comprehension metrics: MOdeling Correctness with Human Annotations. MOCHA contains 40K human judgement scores on model outputs from 6 diverse question answering datasets and an additional set of minimal pairs for evaluation. Using MOCHA, we train an evaluation metric: LERC, a Learned Evaluation metric for Reading Comprehension, to mimic human judgement scores.", "evaluation_metadata": {}}, "moroco": {"name": "moroco", "description": "The MOROCO (Moldavian and Romanian Dialectal Corpus) dataset contains 33564 samples of text collected from the news domain.\nThe samples belong to one of the following six topics:\n - culture\n - finance\n - politics\n - science\n - sports\n - tech", "evaluation_metadata": {}}, "movie_rationales": {"name": "movie_rationales", "description": "The movie rationale dataset contains human annotated rationales for movie\nreviews.", "evaluation_metadata": {}}, "mrqa": {"name": "mrqa", "description": "The MRQA 2019 Shared Task focuses on generalization in question answering.\nAn effective question answering system should do more than merely\ninterpolate from the training set to answer test examples drawn\nfrom the same distribution: it should also be able to extrapolate\nto out-of-distribution examples \u2014 a significantly harder challenge.\n\nThe dataset is a collection of 18 existing QA dataset (carefully selected\nsubset of them) and converted to the same format (SQuAD format). Among\nthese 18 datasets, six datasets were made available for training,\nsix datasets were made available for development, and the final six\nfor testing. The dataset is released as part of the MRQA 2019 Shared Task.", "evaluation_metadata": {}}, "ms_marco": {"name": "ms_marco", "description": "Starting with a paper released at NIPS 2016, MS MARCO is a collection of datasets focused on deep learning in search.\n\nThe first dataset was a question answering dataset featuring 100,000 real Bing questions and a human generated answer.\nSince then we released a 1,000,000 question dataset, a natural langauge generation dataset, a passage ranking dataset,\nkeyphrase extraction dataset, crawling dataset, and a conversational search.\n\nThere have been 277 submissions. 20 KeyPhrase Extraction submissions, 87 passage ranking submissions, 0 document ranking\nsubmissions, 73 QnA V2 submissions, 82 NLGEN submisions, and 15 QnA V1 submissions\n\nThis data comes in three tasks/forms: Original QnA dataset(v1.1), Question Answering(v2.1), Natural Language Generation(v2.1).\n\nThe original question answering datset featured 100,000 examples and was released in 2016. Leaderboard is now closed but data is availible below.\n\nThe current competitive tasks are Question Answering and Natural Language Generation. Question Answering features over 1,000,000 queries and\nis much like the original QnA dataset but bigger and with higher quality. The Natural Language Generation dataset features 180,000 examples and\nbuilds upon the QnA dataset to deliver answers that could be spoken by a smart speaker.", "evaluation_metadata": {}}, "msr_sqa": {"name": "msr_sqa", "description": "Recent work in semantic parsing for question answering has focused on long and complicated questions, many of which would seem unnatural if asked in a normal conversation between two humans. In an effort to explore a conversational QA setting, we present a more realistic task: answering sequences of simple but inter-related questions. We created SQA by asking crowdsourced workers to decompose 2,022 questions from WikiTableQuestions (WTQ), which contains highly-compositional questions about tables from Wikipedia. We had three workers decompose each WTQ question, resulting in a dataset of 6,066 sequences that contain 17,553 questions in total. Each question is also associated with answers in the form of cell locations in the tables.", "evaluation_metadata": {}}, "msra_ner": {"name": "msra_ner", "description": "The Third International Chinese Language\nProcessing Bakeoff was held in Spring\n2006 to assess the state of the art in two\nimportant tasks: word segmentation and\nnamed entity recognition. Twenty-nine\ngroups submitted result sets in the two\ntasks across two tracks and a total of five\ncorpora. We found strong results in both\ntasks as well as continuing challenges.\n\nMSRA NER is one of the provided dataset.\nThere are three types of NE, PER (person),\nORG (organization) and LOC (location).\nThe dataset is in the BIO scheme.\n\nFor more details see https://faculty.washington.edu/levow/papers/sighan06.pdf", "evaluation_metadata": [{"config": "msra_ner", "task": "token-classification", "task_id": "entity_extraction", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"tokens": "tokens", "ner_tags": "tags"}, "metrics": [{"type": "seqeval", "name": "seqeval"}]}]}, "mt_eng_vietnamese": {"name": "mt_eng_vietnamese", "description": "Preprocessed Dataset from IWSLT'15 English-Vietnamese machine translation: English-Vietnamese.", "evaluation_metadata": {}}, "muchocine": {"name": "muchocine", "description": "The Muchocine reviews dataset contains 3,872 longform movie reviews in Spanish language,\neach with a shorter summary review, and a rating on a 1-5 scale.", "evaluation_metadata": {}}, "multi_booked": {"name": "multi_booked", "description": "MultiBooked is a corpus of Basque and Catalan Hotel Reviews Annotated for Aspect-level Sentiment Classification.\n\nThe corpora are compiled from hotel reviews taken mainly from booking.com. The corpora are in Kaf/Naf format, which is\nan xml-style stand-off format that allows for multiple layers of annotation. Each review was sentence- and\nword-tokenized and lemmatized using Freeling for Catalan and ixa-pipes for Basque. Finally, for each language two\nannotators annotated opinion holders, opinion targets, and opinion expressions for each review, following the\nguidelines set out in the OpeNER project.", "evaluation_metadata": {}}, "multi_eurlex": {"name": "multi_eurlex", "description": "MultiEURLEX comprises 65k EU laws in 23 official EU languages (some low-ish resource).\nEach EU law has been annotated with EUROVOC concepts (labels) by the Publication Office of EU.\nAs with the English EURLEX, the goal is to predict the relevant EUROVOC concepts (labels);\nthis is multi-label classification task (given the text, predict multiple labels).", "evaluation_metadata": {}}, "multi_news": {"name": "multi_news", "description": "Multi-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"|||||\".\n - summary: news summary.", "evaluation_metadata": [{"config": "default", "task": "summarization", "task_id": "summarization", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"document": "text", "summary": "target"}, "metrics": [{"type": "rouge", "name": "Rouge"}]}]}, "multi_nli": {"name": "multi_nli", "description": "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a\ncrowd-sourced collection of 433k sentence pairs annotated with textual\nentailment information. The corpus is modeled on the SNLI corpus, but differs in\nthat covers a range of genres of spoken and written text, and supports a\ndistinctive cross-genre generalization evaluation. The corpus served as the\nbasis for the shared task of the RepEval 2017 Workshop at EMNLP in Copenhagen.", "evaluation_metadata": {}}, "multi_nli_mismatch": {"name": "multi_nli_mismatch", "description": "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a\ncrowd-sourced collection of 433k sentence pairs annotated with textual\nentailment information. The corpus is modeled on the SNLI corpus, but differs in\nthat covers a range of genres of spoken and written text, and supports a\ndistinctive cross-genre generalization evaluation. The corpus served as the\nbasis for the shared task of the RepEval 2017 Workshop at EMNLP in Copenhagen.", "evaluation_metadata": {}}, "multi_para_crawl": {"name": "multi_para_crawl", "description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.", "evaluation_metadata": {}}, "multi_re_qa": {"name": "multi_re_qa", "description": "MultiReQA contains the sentence boundary annotation from eight publicly available QA datasets including SearchQA, TriviaQA, HotpotQA, NaturalQuestions, SQuAD, BioASQ, RelationExtraction, and TextbookQA. Five of these datasets, including SearchQA, TriviaQA, HotpotQA, NaturalQuestions, SQuAD, contain both training and test data, and three, including BioASQ, RelationExtraction, TextbookQA, contain only the test data", "evaluation_metadata": {}}, "multi_woz_v22": {"name": "multi_woz_v22", "description": "Multi-Domain Wizard-of-Oz dataset (MultiWOZ), a fully-labeled collection of human-human written conversations spanning over multiple domains and topics.\nMultiWOZ 2.1 (Eric et al., 2019) identified and fixed many erroneous annotations and user utterances in the original version, resulting in an\nimproved version of the dataset. MultiWOZ 2.2 is a yet another improved version of this dataset, which identifies and fizes dialogue state annotation errors\nacross 17.3% of the utterances on top of MultiWOZ 2.1 and redefines the ontology by disallowing vocabularies of slots with a large number of possible values\n(e.g., restaurant name, time of booking) and introducing standardized slot span annotations for these slots.", "evaluation_metadata": {}}, "multi_x_science_sum": {"name": "multi_x_science_sum", "description": "Multi-XScience, a large-scale multi-document summarization dataset created from scientific articles. Multi-XScience introduces a challenging multi-document summarization task: writing the related-work section of a paper based on its abstract and the articles it references.", "evaluation_metadata": {}}, "multidoc2dial": {"name": "multidoc2dial", "description": "MultiDoc2Dial is a new task and dataset on modeling goal-oriented dialogues grounded in multiple documents. Most previous works treat document-grounded dialogue modeling as a machine reading comprehension task based on a single given document or passage. We aim to address more realistic scenarios where a goal-oriented information-seeking conversation involves multiple topics, and hence is grounded on different documents.", "evaluation_metadata": {}}, "mutual_friends": {"name": "mutual_friends", "description": "Our goal is to build systems that collaborate with people by exchanging\ninformation through natural language and reasoning over structured knowledge\nbase. In the MutualFriend task, two agents, A and B, each have a private\nknowledge base, which contains a list of friends with multiple attributes\n(e.g., name, school, major, etc.). The agents must chat with each other\nto find their unique mutual friend.", "evaluation_metadata": {}}, "myanmar_news": {"name": "myanmar_news", "description": "The Myanmar news dataset contains article snippets in four categories:\nBusiness, Entertainment, Politics, and Sport.\n\nThese were collected in October 2017 by Aye Hninn Khine", "evaluation_metadata": {}}, "natural_questions": {"name": "natural_questions", "description": "The NQ corpus contains questions from real users, and it requires QA systems to\nread and comprehend an entire Wikipedia article that may or may not contain the\nanswer to the question. The inclusion of real user questions, and the\nrequirement that solutions should read an entire page to find the answer, cause\nNQ to be a more realistic and challenging task than prior QA datasets.", "evaluation_metadata": {}}, "ncbi_disease": {"name": "ncbi_disease", "description": "This paper presents the disease name and concept annotations of the NCBI disease corpus, a collection of 793 PubMed\nabstracts fully annotated at the mention and concept level to serve as a research resource for the biomedical natural\nlanguage processing community. Each PubMed abstract was manually annotated by two annotators with disease mentions\nand their corresponding concepts in Medical Subject Headings (MeSH\u00ae) or Online Mendelian Inheritance in Man (OMIM\u00ae).\nManual curation was performed using PubTator, which allowed the use of pre-annotations as a pre-step to manual annotations.\nFourteen annotators were randomly paired and differing annotations were discussed for reaching a consensus in two\nannotation phases. In this setting, a high inter-annotator agreement was observed. Finally, all results were checked\nagainst annotations of the rest of the corpus to assure corpus-wide consistency.\n\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3951655/\n\nThe original dataset can be downloaded from: https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBI_corpus.zip\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll\nNote: there is a duplicate document (PMID 8528200) in the original data, and the duplicate is recreated in the converted data.", "evaluation_metadata": [{"config": "ncbi_disease", "task": "token-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"tokens": "text", "ner_tags": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "ncslgr": {"name": "ncslgr", "description": "A small corpus of American Sign Language (ASL) video data from native signers, annotated with non-manual features.", "evaluation_metadata": {}}, "nell": {"name": "nell", "description": "This dataset provides version 1115 of the belief\nextracted by CMU's Never Ending Language Learner (NELL) and version\n1110 of the candidate belief extracted by NELL. See\nhttp://rtw.ml.cmu.edu/rtw/overview. NELL is an open information\nextraction system that attempts to read the Clueweb09 of 500 million\nweb pages (http://boston.lti.cs.cmu.edu/Data/clueweb09/) and general\nweb searches.\n\nThe dataset has 4 configurations: nell_belief, nell_candidate,\nnell_belief_sentences, and nell_candidate_sentences. nell_belief is\ncertainties of belief are lower. The two sentences config extracts the\nCPL sentence patterns filled with the applicable 'best' literal string\nfor the entities filled into the sentence patterns. And also provides\nsentences found using web searches containing the entities and\nrelationships.\n\nThere are roughly 21M entries for nell_belief_sentences, and 100M\nsentences for nell_candidate_sentences.", "evaluation_metadata": {}}, "neural_code_search": {"name": "neural_code_search", "description": "Neural-Code-Search-Evaluation-Dataset presents an evaluation dataset consisting of natural language query and code snippet pairs and a search corpus consisting of code snippets collected from the most popular Android repositories on GitHub.", "evaluation_metadata": {}}, "news_commentary": {"name": "news_commentary", "description": "A parallel corpus of News Commentaries provided by WMT for training SMT. The source is taken from CASMACAT: http://www.casmacat.eu/corpus/news-commentary.html\n\n12 languages, 63 bitexts\ntotal number of files: 61,928\ntotal number of tokens: 49.66M\ntotal number of sentence fragments: 1.93M", "evaluation_metadata": {}}, "newsgroup": {"name": "newsgroup", "description": "The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.", "evaluation_metadata": {}}, "newsph": {"name": "newsph", "description": "Large-scale dataset of Filipino news articles. Sourced for the NewsPH-NLI Project (Cruz et al., 2020).", "evaluation_metadata": {}}, "newsph_nli": {"name": "newsph_nli", "description": "First benchmark dataset for sentence entailment in the low-resource Filipino language.\nConstructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs,\nin 70-15-15 split for training, validation, and testing.", "evaluation_metadata": {}}, "newspop": {"name": "newspop", "description": "This is a large data set of news items and their respective social feedback on multiple platforms: Facebook, Google+ and LinkedIn.\nThe collected data relates to a period of 8 months, between November 2015 and July 2016, accounting for about 100,000 news items on four different topics: economy, microsoft, obama and palestine.\nThis data set is tailored for evaluative comparisons in predictive analytics tasks, although allowing for tasks in other research areas such as topic detection and tracking, sentiment analysis in short text, first story detection or news recommendation.", "evaluation_metadata": {}}, "nkjp-ner": {"name": "nkjp-ner", "description": "The NKJP-NER is based on a human-annotated part of National Corpus of Polish (NKJP). We extracted sentences with named entities of exactly one type. The task is to predict the type of the named entity.", "evaluation_metadata": {}}, "nli_tr": {"name": "nli_tr", "description": "\\\r\nThe Natural Language Inference in Turkish (NLI-TR) is a set of two large scale datasets that were obtained by translating the foundational NLI corpora (SNLI and MNLI) using Amazon Translate.", "evaluation_metadata": {}}, "nlu_evaluation_data": {"name": "nlu_evaluation_data", "description": "Raw part of NLU Evaluation Data. It contains 25 715 non-empty examples (original dataset has 25716 examples) from 68 unique intents belonging to 18 scenarios.", "evaluation_metadata": {}}, "norec": {"name": "norec", "description": "NoReC was created as part of the SANT project (Sentiment Analysis for Norwegian Text), a collaboration between the Language Technology Group (LTG) at the Department of Informatics at the University of Oslo, the Norwegian Broadcasting Corporation (NRK), Schibsted Media Group and Aller Media. This first release of the corpus comprises 35,194 reviews extracted from eight different news sources: Dagbladet, VG, Aftenposten, Bergens Tidende, F\u00e6drelandsvennen, Stavanger Aftenblad, DinSide.no and P3.no. In terms of publishing date the reviews mainly cover the time span 2003\u20132017, although it also includes a handful of reviews dating back as far as 1998.", "evaluation_metadata": {}}, "norne": {"name": "norne", "description": "NorNE is a manually annotated\ncorpus of named entities which extends the annotation of the existing\nNorwegian Dependency Treebank. Comprising both of the official standards of\nwritten Norwegian (Bokm\u00e5l and Nynorsk), the corpus contains around 600,000\ntokens and annotates a rich set of entity types including persons,\norganizations, locations, geo-political entities, products, and events,\nin addition to a class corresponding to nominals derived from names.", "evaluation_metadata": {}}, "norwegian_ner": {"name": "norwegian_ner", "description": "Named entities Recognition dataset for Norwegian. It is\na version of the Universal Dependency (UD) Treebank for both Bokm\u00e5l and Nynorsk (UDN) where\nall proper nouns have been tagged with their type according to the NER tagging scheme. UDN is a converted\nversion of the Norwegian Dependency Treebank into the UD scheme.", "evaluation_metadata": {}}, "nq_open": {"name": "nq_open", "description": "The NQ-Open task, introduced by Lee et.al. 2019,\nis an open domain question answering benchmark that is derived from Natural Questions.\nThe goal is to predict an English answer string for an input English question.\nAll questions can be answered using the contents of English Wikipedia.", "evaluation_metadata": {}}, "nsmc": {"name": "nsmc", "description": "This is a movie review dataset in the Korean language. Reviews were scraped from Naver movies. The dataset construction is based on the method noted in Large movie review dataset from Maas et al., 2011.", "evaluation_metadata": {}}, "numer_sense": {"name": "numer_sense", "description": "NumerSense is a new numerical commonsense reasoning probing task, with a diagnostic dataset consisting of 3,145 masked-word-prediction probes.\n\nWe propose to study whether numerical commonsense knowledge can be induced from pre-trained language models like BERT, and to what extent this access to knowledge robust against adversarial examples is. We hope this will be beneficial for tasks such as knowledge base completion and open-domain question answering.", "evaluation_metadata": {}}, "numeric_fused_head": {"name": "numeric_fused_head", "description": "Fused Head constructions are noun phrases in which the head noun is missing and is said to be \"fused\" with its dependent modifier. This missing information is implicit and is important for sentence understanding.The missing heads are easily filled in by humans, but pose a challenge for computational models.\n\nFor example, in the sentence: \"I bought 5 apples but got only 4.\", 4 is a Fused-Head, and the missing head is apples, which appear earlier in the sentence.\n\nThis is a crowd-sourced dataset of 10k numerical fused head examples (1M tokens).", "evaluation_metadata": {}}, "oclar": {"name": "oclar", "description": "The researchers of OCLAR Marwan et al. (2019), they gathered Arabic costumer reviews from Google reviewsa and Zomato\nwebsite (https://www.zomato.com/lebanon) on wide scope of domain, including restaurants, hotels, hospitals, local shops,\netc.The corpus finally contains 3916 reviews in 5-rating scale. For this research purpose, the positive class considers\nrating stars from 5 to 3 of 3465 reviews, and the negative class is represented from values of 1 and 2 of about\n451 texts.", "evaluation_metadata": {}}, "offcombr": {"name": "offcombr", "description": "OffComBR: an annotated dataset containing for hate speech detection in Portuguese composed of news comments on the Brazilian Web.", "evaluation_metadata": {}}, "offenseval2020_tr": {"name": "offenseval2020_tr", "description": "OffensEval-TR 2020 is a Turkish offensive language corpus. The corpus consist of randomly sampled tweets and annotated in a similar way to OffensEval and GermEval.", "evaluation_metadata": {}}, "offenseval_dravidian": {"name": "offenseval_dravidian", "description": "Offensive language identification in dravidian lanaguages dataset. The goal of this task is to identify offensive language content of the code-mixed dataset of comments/posts in Dravidian Languages ( (Tamil-English, Malayalam-English, and Kannada-English)) collected from social media.", "evaluation_metadata": {}}, "ofis_publik": {"name": "ofis_publik", "description": "Texts from the Ofis Publik ar Brezhoneg (Breton Language Board) provided by Francis Tyers\n2 languages, total number of files: 278\ntotal number of tokens: 2.12M\ntotal number of sentence fragments: 0.13M", "evaluation_metadata": {}}, "ohsumed": {"name": "ohsumed", "description": "The OHSUMED test collection is a set of 348,566 references from\nMEDLINE, the on-line medical information database, consisting of\ntitles and/or abstracts from 270 medical journals over a five-year\nperiod (1987-1991). The available fields are title, abstract, MeSH\nindexing terms, author, source, and publication type.", "evaluation_metadata": {}}, "omp": {"name": "omp", "description": "The \u201cOne Million Posts\u201d corpus is an annotated data set consisting of\nuser comments posted to an Austrian newspaper website (in German language).\n\nDER STANDARD is an Austrian daily broadsheet newspaper. On the newspaper\u2019s website,\nthere is a discussion section below each news article where readers engage in\nonline discussions. The data set contains a selection of user posts from the\n12 month time span from 2015-06-01 to 2016-05-31. There are 11,773 labeled and\n1,000,000 unlabeled posts in the data set. The labeled posts were annotated by\nprofessional forum moderators employed by the newspaper.\n\nThe data set contains the following data for each post:\n\n* Post ID\n* Article ID\n* Headline (max. 250 characters)\n* Main Body (max. 750 characters)\n* User ID (the user names used by the website have been re-mapped to new numeric IDs)\n* Time stamp\n* Parent post (replies give rise to tree-like discussion thread structures)\n* Status (online or deleted by a moderator)\n* Number of positive votes by other community members\n* Number of negative votes by other community members\n\nFor each article, the data set contains the following data:\n\n* Article ID\n* Publishing date\n* Topic Path (e.g.: Newsroom / Sports / Motorsports / Formula 1)\n* Title\n* Body\n\nDetailed descriptions of the post selection and annotation procedures are given in the paper.\n\n## Annotated Categories\n\nPotentially undesirable content:\n\n* Sentiment (negative/neutral/positive)\n An important goal is to detect changes in the prevalent sentiment in a discussion, e.g.,\n the location within the fora and the point in time where a turn from positive/neutral\n sentiment to negative sentiment takes place.\n* Off-Topic (yes/no)\n Posts which digress too far from the topic of the corresponding article.\n* Inappropriate (yes/no)\n Swearwords, suggestive and obscene language, insults, threats etc.\n* Discriminating (yes/no)\n Racist, sexist, misogynistic, homophobic, antisemitic and other misanthropic content.\n\nNeutral content that requires a reaction:\n\n* Feedback (yes/no)\n Sometimes users ask questions or give feedback to the author of the article or the\n newspaper in general, which may require a reply/reaction.\n\nPotentially desirable content:\n\n* Personal Stories (yes/no)\n In certain fora, users are encouraged to share their personal stories, experiences,\n anecdotes etc. regarding the respective topic.\n* Arguments Used (yes/no)\n It is desirable for users to back their statements with rational argumentation,\n reasoning and sources.", "evaluation_metadata": {}}, "onestop_english": {"name": "onestop_english", "description": "This dataset is a compilation of the OneStopEnglish corpus of texts written at three reading levels into one file.\nText documents are classified into three reading levels - ele, int, adv (Elementary, Intermediate and Advance).\nThis dataset demonstrates its usefulness for through two applica-tions - automatic readability assessment and automatic text simplification.\nThe corpus consists of 189 texts, each in three versions/reading levels (567 in total).", "evaluation_metadata": {}}, "onestop_qa": {"name": "onestop_qa", "description": "OneStopQA is a multiple choice reading comprehension dataset annotated according to the STARC (Structured Annotations for Reading Comprehension) scheme. The reading materials are Guardian articles taken from the [OneStopEnglish corpus](https://github.com/nishkalavallabhi/OneStopEnglishCorpus). Each article comes in three difficulty levels, Elementary, Intermediate and Advanced. Each paragraph is annotated with three multiple choice reading comprehension questions. The reading comprehension questions can be answered based on any of the three paragraph levels.", "evaluation_metadata": {}}, "open_subtitles": {"name": "open_subtitles", "description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G", "evaluation_metadata": {}}, "openai_humaneval": {"name": "openai_humaneval", "description": "The HumanEval dataset released by OpenAI contains 164 handcrafted programming challenges together with unittests to very the viability of a proposed solution.", "evaluation_metadata": {}}, "openbookqa": {"name": "openbookqa", "description": "OpenBookQA aims to promote research in advanced question-answering, probing a deeper understanding of both the topic\n(with salient facts summarized as an open book, also provided with the dataset) and the language it is expressed in. In\nparticular, it contains questions that require multi-step reasoning, use of additional common and commonsense knowledge,\nand rich text comprehension.\nOpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding\nof a subject.", "evaluation_metadata": {}}, "openslr": {"name": "openslr", "description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.", "evaluation_metadata": {}}, "Skylion007/openwebtext": {"name": "Skylion007/openwebtext", "description": "An open-source replication of the WebText dataset from OpenAI.", "evaluation_metadata": {}}, "opinosis": {"name": "opinosis", "description": "The Opinosis Opinion Dataset consists of sentences extracted from reviews for 51 topics.\nTopics and opinions are obtained from Tripadvisor, Edmunds.com and Amazon.com.", "evaluation_metadata": {}}, "opus100": {"name": "opus100", "description": "OPUS-100 is English-centric, meaning that all training pairs include English on either the source or target side.\nThe corpus covers 100 languages (including English).OPUS-100 contains approximately 55M sentence pairs.\nOf the 99 language pairs, 44 have 1M sentence pairs of training data, 73 have at least 100k, and 95 have at least 10k.", "evaluation_metadata": {}}, "opus_books": {"name": "opus_books", "description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M", "evaluation_metadata": {}}, "opus_dgt": {"name": "opus_dgt", "description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M", "evaluation_metadata": {}}, "opus_dogc": {"name": "opus_dogc", "description": "This is a collection of documents from the Official Journal of the Government of Catalonia, in Catalan and Spanish languages, provided by Antoni Oliver Gonzalez from the Universitat Oberta de Catalunya.", "evaluation_metadata": {}}, "opus_elhuyar": {"name": "opus_elhuyar", "description": "Dataset provided by the foundation Elhuyar, which is having data in languages Spanish to Basque.", "evaluation_metadata": {}}, "opus_euconst": {"name": "opus_euconst", "description": "A parallel corpus collected from the European Constitution for 21 language.", "evaluation_metadata": {}}, "opus_finlex": {"name": "opus_finlex", "description": "The Finlex Data Base is a comprehensive collection of legislative and other judicial information of Finland, which is available in Finnish, Swedish and partially in English. This corpus is taken from the Semantic Finlex serice that provides the Finnish and Swedish data as linked open data and also raw XML files.", "evaluation_metadata": {}}, "opus_fiskmo": {"name": "opus_fiskmo", "description": "fiskmo, a massive parallel corpus for Finnish and Swedish.", "evaluation_metadata": {}}, "opus_gnome": {"name": "opus_gnome", "description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M", "evaluation_metadata": {}}, "opus_infopankki": {"name": "opus_infopankki", "description": "A parallel corpus of 12 languages, 66 bitexts.", "evaluation_metadata": {}}, "opus_memat": {"name": "opus_memat", "description": "Xhosa-English parallel corpora, funded by EPSRC, the Medical Machine Translation project worked on machine translation between ixiXhosa and English, with a focus on the medical domain.", "evaluation_metadata": {}}, "opus_montenegrinsubs": {"name": "opus_montenegrinsubs", "description": "Opus MontenegrinSubs dataset for machine translation task, for language pair en-me: english and montenegrin", "evaluation_metadata": {}}, "opus_openoffice": {"name": "opus_openoffice", "description": "A collection of documents from http://www.openoffice.org/.", "evaluation_metadata": {}}, "opus_paracrawl": {"name": "opus_paracrawl", "description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G", "evaluation_metadata": {}}, "opus_rf": {"name": "opus_rf", "description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.", "evaluation_metadata": {}}, "opus_tedtalks": {"name": "opus_tedtalks", "description": "This is a Croatian-English parallel corpus of transcribed and translated TED talks, originally extracted from https://wit3.fbk.eu. The corpus is compiled by \u017deljko Agi\u0107 and is taken from http://lt.ffzg.hr/zagic provided under the CC-BY-NC-SA license.\n2 languages, total number of files: 2\ntotal number of tokens: 2.81M\ntotal number of sentence fragments: 0.17M", "evaluation_metadata": {}}, "opus_ubuntu": {"name": "opus_ubuntu", "description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M", "evaluation_metadata": {}}, "opus_wikipedia": {"name": "opus_wikipedia", "description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M", "evaluation_metadata": {}}, "opus_xhosanavy": {"name": "opus_xhosanavy", "description": "This dataset is designed for machine translation from English to Xhosa.", "evaluation_metadata": {}}, "orange_sum": {"name": "orange_sum", "description": "The OrangeSum dataset was inspired by the XSum dataset. It was created by scraping the \"Orange Actu\" website: https://actu.orange.fr/. Orange S.A. is a large French multinational telecommunications corporation, with 266M customers worldwide. Scraped pages cover almost a decade from Feb 2011 to Sep 2020. They belong to five main categories: France, world, politics, automotive, and society. The society category is itself divided into 8 subcategories: health, environment, people, culture, media, high-tech, unsual (\"insolite\" in French), and miscellaneous.\n\nEach article featured a single-sentence title as well as a very brief abstract, both professionally written by the author of the article. These two fields were extracted from each page, thus creating two summarization tasks: OrangeSum Title and OrangeSum Abstract.", "evaluation_metadata": {}}, "oscar": {"name": "oscar", "description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.\\", "evaluation_metadata": {}}, "para_pat": {"name": "para_pat", "description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.", "evaluation_metadata": {}}, "parsinlu_reading_comprehension": {"name": "parsinlu_reading_comprehension", "description": "A Persian reading comprehenion task (generating an answer, given a question and a context paragraph).\nThe questions are mined using Google auto-complete, their answers and the corresponding evidence documents are manually annotated by native speakers.", "evaluation_metadata": {}}, "paws-x": {"name": "paws-x", "description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.", "evaluation_metadata": {}}, "paws": {"name": "paws", "description": "PAWS: Paraphrase Adversaries from Word Scrambling\n\nThis dataset contains 108,463 human-labeled and 656k noisily labeled pairs that feature\nthe importance of modeling structure, context, and word order information for the problem\nof paraphrase identification. The dataset has two subsets, one based on Wikipedia and the\nother one based on the Quora Question Pairs (QQP) dataset.\n\nFor further details, see the accompanying paper: PAWS: Paraphrase Adversaries from Word Scrambling\n(https://arxiv.org/abs/1904.01130)\n\nPAWS-QQP is not available due to license of QQP. It must be reconstructed by downloading the original\ndata and then running our scripts to produce the data and attach the labels.\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.", "evaluation_metadata": {}}, "pec": {"name": "pec", "description": "\\\r\nA dataset of around 350K persona-based empathetic conversations. Each speaker is associated with a persona, which comprises multiple persona sentences. The response of each conversation is empathetic.", "evaluation_metadata": {}}, "per_sent": {"name": "per_sent", "description": "Person SenTiment (PerSenT) is a crowd-sourced dataset that captures the sentiment of an author towards the main entity in a news article. This dataset contains annotation for 5.3k documents and 38k paragraphs covering 3.2k unique entities.\n\nThe dataset consists of sentiment annotations on news articles about people. For each article, annotators judge what the author\u2019s sentiment is towards the main (target) entity of the article. The annotations also include similar judgments on paragraphs within the article.\n\nTo split the dataset, entities into 4 mutually exclusive sets. Due to the nature of news collections, some entities tend to dominate the collection. In the collection, there were four entities which were the main entity in nearly 800 articles. To avoid these entities from dominating the train or test splits, we moved them to a separate test collection. We split the remaining into a training, dev, and test sets at random. Thus our collection includes one standard test set consisting of articles drawn at random (Test Standard -- `test_random`), while the other is a test set which contains multiple articles about a small number of popular entities (Test Frequent -- `test_fixed`).", "evaluation_metadata": {}}, "persian_ner": {"name": "persian_ner", "description": "The dataset includes 250,015 tokens and 7,682 Persian sentences in total. It is available in 3 folds to be used in turn as training and test sets. The NER tags are in IOB format.", "evaluation_metadata": {}}, "pg19": {"name": "pg19", "description": "This repository contains the PG-19 language modeling benchmark.\nIt includes a set of books extracted from the Project Gutenberg books library, that were published before 1919.\nIt also contains metadata of book titles and publication dates.\n\nPG-19 is over double the size of the Billion Word benchmark and contains documents that are 20X longer, on average, than the WikiText long-range language modelling benchmark.\nBooks are partitioned into a train, validation, and test set. Book metadata is stored in metadata.csv which contains (book_id, short_book_title, publication_date).\n\nUnlike prior benchmarks, we do not constrain the vocabulary size --- i.e. mapping rare words to an UNK token --- but instead release the data as an open-vocabulary benchmark. The only processing of the text that has been applied is the removal of boilerplate license text, and the mapping of offensive discriminatory words as specified by Ofcom to placeholder tokens. Users are free to model the data at the character-level, subword-level, or via any mechanism that can model an arbitrary string of text.\nTo compare models we propose to continue measuring the word-level perplexity, by calculating the total likelihood of the dataset (via any chosen subword vocabulary or character-based scheme) divided by the number of tokens --- specified below in the dataset statistics table.\nOne could use this dataset for benchmarking long-range language models, or use it to pre-train for other natural language processing tasks which require long-range reasoning, such as LAMBADA or NarrativeQA. We would not recommend using this dataset to train a general-purpose language model, e.g. for applications to a production-system dialogue agent, due to the dated linguistic style of old texts and the inherent biases present in historical writing.", "evaluation_metadata": {}}, "php": {"name": "php", "description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M", "evaluation_metadata": {}}, "etalab-ia/piaf": {"name": "etalab-ia/piaf", "description": "Piaf is a reading comprehension dataset. This version, published in February 2020, contains 3835 questions on French Wikipedia.", "evaluation_metadata": {}}, "pib": {"name": "pib", "description": "Sentence aligned parallel corpus between 11 Indian Languages, crawled and extracted from the press information bureau\nwebsite.", "evaluation_metadata": {}}, "piqa": {"name": "piqa", "description": "To apply eyeshadow without a brush, should I use a cotton swab or a toothpick?\nQuestions requiring this kind of physical commonsense pose a challenge to state-of-the-art\nnatural language understanding systems. The PIQA dataset introduces the task of physical commonsense reasoning\nand a corresponding benchmark dataset Physical Interaction: Question Answering or PIQA.\n\nPhysical commonsense knowledge is a major challenge on the road to true AI-completeness,\nincluding robots that interact with the world and understand natural language.\n\nPIQA focuses on everyday situations with a preference for atypical solutions.\nThe dataset is inspired by instructables.com, which provides users with instructions on how to build, craft,\nbake, or manipulate objects using everyday materials.\n\nThe underlying task is formualted as multiple choice question answering:\ngiven a question `q` and two possible solutions `s1`, `s2`, a model or\na human must choose the most appropriate solution, of which exactly one is correct.\nThe dataset is further cleaned of basic artifacts using the AFLite algorithm which is an improvement of\nadversarial filtering. The dataset contains 16,000 examples for training, 2,000 for development and 3,000 for testing.", "evaluation_metadata": {}}, "pn_summary": {"name": "pn_summary", "description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\\n\")`) and then use them for your purposes.", "evaluation_metadata": {}}, "poem_sentiment": {"name": "poem_sentiment", "description": "Poem Sentiment is a sentiment dataset of poem verses from Project Gutenberg. This dataset can be used for tasks such as sentiment classification or style transfer for poems.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"verse_text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "polemo2": {"name": "polemo2", "description": "The PolEmo2.0 is a set of online reviews from medicine and hotels domains. The task is to predict the sentiment of a review. There are two separate test sets, to allow for in-domain (medicine and hotels) as well as out-of-domain (products and university) validation.", "evaluation_metadata": {}}, "poleval2019_cyberbullying": {"name": "poleval2019_cyberbullying", "description": " In Task 6-1, the participants are to distinguish between normal/non-harmful tweets (class: 0) and tweets\n that contain any kind of harmful information (class: 1). This includes cyberbullying, hate speech and\n related phenomena.\n\n In Task 6-2, the participants shall distinguish between three classes of tweets: 0 (non-harmful),\n 1 (cyberbullying), 2 (hate-speech). There are various definitions of both cyberbullying and hate-speech,\n some of them even putting those two phenomena in the same group. The specific conditions on which we based\n our annotations for both cyberbullying and hate-speech, which have been worked out during ten years of research\n will be summarized in an introductory paper for the task, however, the main and definitive condition to 1\n distinguish the two is whether the harmful action is addressed towards a private person(s) (cyberbullying),\n or a public person/entity/large group (hate-speech).", "evaluation_metadata": {}}, "polyglot_ner": {"name": "polyglot_ner", "description": "Polyglot-NER\nA training dataset automatically generated from Wikipedia and Freebase the task\nof named entity recognition. The dataset contains the basic Wikipedia based\ntraining data for 40 languages we have (with coreference resolution) for the task of\nnamed entity recognition. The details of the procedure of generating them is outlined in\nSection 3 of the paper (https://arxiv.org/abs/1410.3791). Each config contains the data\ncorresponding to a different language. For example, \"es\" includes only spanish examples.", "evaluation_metadata": {}}, "prachathai67k": {"name": "prachathai67k", "description": "`prachathai-67k`: News Article Corpus and Multi-label Text Classificdation from Prachathai.com\nThe prachathai-67k dataset was scraped from the news site Prachathai.\nWe filtered out those articles with less than 500 characters of body text, mostly images and cartoons.\nIt contains 67,889 articles wtih 12 curated tags from August 24, 2004 to November 15, 2018.\nThe dataset was originally scraped by @lukkiddd and cleaned by @cstorm125.\nYou can also see preliminary exploration at https://github.com/PyThaiNLP/prachathai-67k/blob/master/exploration.ipynb", "evaluation_metadata": {}}, "pragmeval": {"name": "pragmeval", "description": "Evaluation of language understanding with a 11 datasets benchmark focusing on discourse and pragmatics", "evaluation_metadata": {}}, "proto_qa": {"name": "proto_qa", "description": "This dataset is for studying computational models trained to reason about prototypical situations. Using deterministic filtering a sampling from a larger set of all transcriptions was built. It contains 9789 instances where each instance represents a survey question from Family Feud game. Each instance exactly is a question, a set of answers, and a count associated with each answer.\nEach line is a json dictionary, in which:\n1. question - contains the question (in original and a normalized form)\n2. answerstrings - contains the original answers provided by survey respondents (when available), along with the counts for each string. Because the FamilyFeud data has only cluster names rather than strings, those cluster names are included with 0 weight.\n3. answer-clusters - lists clusters, with the count of each cluster and the strings included in that cluster. Each cluster is given a unique ID that can be linked to in the assessment files.", "evaluation_metadata": {}}, "psc": {"name": "psc", "description": "The Polish Summaries Corpus contains news articles and their summaries. We used summaries of the same article as positive pairs and sampled the most similar summaries of different articles as negatives.", "evaluation_metadata": {}}, "ptb_text_only": {"name": "ptb_text_only", "description": "This is the Penn Treebank Project: Release 2 CDROM, featuring a million words of 1989 Wall Street Journal material. This corpus has been annotated for part-of-speech (POS) information. In addition, over half of it has been annotated for skeletal syntactic structure.", "evaluation_metadata": {}}, "pubmed": {"name": "pubmed", "description": "NLM produces a baseline set of MEDLINE/PubMed citation records in XML format for download on an annual basis. The annual baseline is released in December of each year. Each day, NLM produces update files that include new, revised and deleted citations. See our documentation page for more information.", "evaluation_metadata": {}}, "pubmed_qa": {"name": "pubmed_qa", "description": "PubMedQA is a novel biomedical question answering (QA) dataset collected from PubMed abstracts.\nThe task of PubMedQA is to answer research questions with yes/no/maybe (e.g.: Do preoperative\nstatins reduce atrial fibrillation after coronary artery bypass grafting?) using the corresponding abstracts.\nPubMedQA has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA instances.\nEach PubMedQA instance is composed of (1) a question which is either an existing research article\ntitle or derived from one, (2) a context which is the corresponding abstract without its conclusion,\n(3) a long answer, which is the conclusion of the abstract and, presumably, answers the research question,\nand (4) a yes/no/maybe answer which summarizes the conclusion.\nPubMedQA is the first QA dataset where reasoning over biomedical research texts, especially their\nquantitative contents, is required to answer the questions.", "evaluation_metadata": {}}, "py_ast": {"name": "py_ast", "description": "Dataset consisting of parsed ASTs that were used to train and\nevaluate the DeepSyn tool.\nThe Python programs are collected from GitHub repositories\nby removing duplicate files, removing project forks (copy of another existing repository)\n,keeping only programs that parse and have at most 30'000 nodes in the AST and\nwe aim to remove obfuscated files", "evaluation_metadata": {}}, "qa4mre": {"name": "qa4mre", "description": "QA4MRE dataset was created for the CLEF 2011/2012/2013 shared tasks to promote research in\nquestion answering and reading comprehension. The dataset contains a supporting\npassage and a set of questions corresponding to the passage. Multiple options\nfor answers are provided for each question, of which only one is correct. The\ntraining and test datasets are available for the main track.\nAdditional gold standard documents are available for two pilot studies: one on\nalzheimers data, and the other on entrance exams data.", "evaluation_metadata": {}}, "qa_srl": {"name": "qa_srl", "description": "The dataset contains question-answer pairs to model verbal predicate-argument structure. The questions start with wh-words (Who, What, Where, What, etc.) and contain a verb predicate in the sentence; the answers are phrases in the sentence.\nThere were 2 datsets used in the paper, newswire and wikipedia. Unfortunately the newswiredataset is built from CoNLL-2009 English training set that is covered under license\nThus, we are providing only Wikipedia training set here. Please check README.md for more details on newswire dataset.\nFor the Wikipedia domain, randomly sampled sentences from the English Wikipedia (excluding questions and sentences with fewer than 10 or more than 60 words) were taken.\nThis new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "qa_zre": {"name": "qa_zre", "description": "A dataset reducing relation extraction to simple reading comprehension questions", "evaluation_metadata": {}}, "qangaroo": {"name": "qangaroo", "description": " We have created two new Reading Comprehension datasets focussing on multi-hop (alias multi-step) inference.\n\nSeveral pieces of information often jointly imply another fact. In multi-hop inference, a new fact is derived by combining facts via a chain of multiple steps.\n\nOur aim is to build Reading Comprehension methods that perform multi-hop inference on text, where individual facts are spread out across different documents.\n\nThe two QAngaroo datasets provide a training and evaluation resource for such methods.", "evaluation_metadata": {}}, "qanta": {"name": "qanta", "description": "The Qanta dataset is a question answering dataset based on the academic trivia game Quizbowl.", "evaluation_metadata": {}}, "qasc": {"name": "qasc", "description": "QASC is a question-answering dataset with a focus on sentence composition. It consists of 9,980 8-way multiple-choice\nquestions about grade school science (8,134 train, 926 dev, 920 test), and comes with a corpus of 17M sentences.", "evaluation_metadata": {}}, "allenai/qasper": {"name": "allenai/qasper", "description": "A dataset containing 1585 papers with 5049 information-seeking questions asked by regular readers of NLP papers, and answered by a separate set of NLP practitioners.", "evaluation_metadata": {}}, "qed": {"name": "qed", "description": "QED, is a linguistically informed, extensible framework for explanations in question answering. A QED explanation specifies the relationship between a question and answer according to formal semantic notions such as referential equality, sentencehood, and entailment. It is an expertannotated dataset of QED explanations built upon a subset of the Google Natural Questions dataset.", "evaluation_metadata": {}}, "qed_amara": {"name": "qed_amara", "description": "The QCRI Educational Domain Corpus (formerly QCRI AMARA Corpus) is an open multilingual collection of subtitles for educational videos and lectures collaboratively transcribed and translated over the AMARA web-based platform.\nDeveloped by: Qatar Computing Research Institute, Arabic Language Technologies Group\nThe QED Corpus is made public for RESEARCH purpose only.\nThe corpus is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. Copyright Qatar Computing Research Institute. All rights reserved.\n225 languages, 9,291 bitexts\ntotal number of files: 271,558\ntotal number of tokens: 371.76M\ntotal number of sentence fragments: 30.93M", "evaluation_metadata": {}}, "quac": {"name": "quac", "description": "Question Answering in Context is a dataset for modeling, understanding,\nand participating in information seeking dialog. Data instances consist\nof an interactive dialog between two crowd workers: (1) a student who\nposes a sequence of freeform questions to learn as much as possible\nabout a hidden Wikipedia text, and (2) a teacher who answers the questions\nby providing short excerpts (spans) from the text. QuAC introduces\nchallenges not found in existing machine comprehension datasets: its\nquestions are often more open-ended, unanswerable, or only meaningful\nwithin the dialog context.", "evaluation_metadata": {}}, "quail": {"name": "quail", "description": "QuAIL is a reading comprehension dataset. QuAIL contains 15K multi-choice questions in texts 300-350 tokens long 4 domains (news, user stories, fiction, blogs).QuAIL is balanced and annotated for question types.\\", "evaluation_metadata": {}}, "quarel": {"name": "quarel", "description": "QuaRel is a crowdsourced dataset of 2771 multiple-choice story questions, including their logical forms.", "evaluation_metadata": {}}, "quartz": {"name": "quartz", "description": "QuaRTz is a crowdsourced dataset of 3864 multiple-choice questions about open domain qualitative relationships. Each\nquestion is paired with one of 405 different background sentences (sometimes short paragraphs).\nThe QuaRTz dataset V1 contains 3864 questions about open domain qualitative relationships. Each question is paired with\none of 405 different background sentences (sometimes short paragraphs).\nThe dataset is split into train (2696), dev (384) and test (784). A background sentence will only appear in a single split.", "evaluation_metadata": {}}, "quoref": {"name": "quoref", "description": "Quoref is a QA dataset which tests the coreferential reasoning capability of reading comprehension systems. In this\nspan-selection benchmark containing 24K questions over 4.7K paragraphs from Wikipedia, a system must resolve hard\ncoreferences before selecting the appropriate span(s) in the paragraphs for answering questions.", "evaluation_metadata": {}}, "race": {"name": "race", "description": "Race is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions. The\n dataset is collected from English examinations in China, which are designed for middle school and high school students.\nThe dataset can be served as the training and test sets for machine comprehension.", "evaluation_metadata": {}}, "re_dial": {"name": "re_dial", "description": "ReDial (Recommendation Dialogues) is an annotated dataset of dialogues, where users\nrecommend movies to each other. The dataset was collected by a team of researchers working at\nPolytechnique Montr\u00e9al, MILA \u2013 Quebec AI Institute, Microsoft Research Montr\u00e9al, HEC Montreal, and Element AI.\n\nThe dataset allows research at the intersection of goal-directed dialogue systems\n(such as restaurant recommendation) and free-form (also called \u201cchit-chat\u201d) dialogue systems.", "evaluation_metadata": {}}, "reasoning_bg": {"name": "reasoning_bg", "description": "This new dataset is designed to do reading comprehension in Bulgarian language.", "evaluation_metadata": {}}, "red_caps": {"name": "red_caps", "description": "RedCaps is a large-scale dataset of 12M image-text pairs collected from Reddit.\nImages and captions from Reddit depict and describe a wide variety of objects and scenes.\nThe data is collected from a manually curated set of subreddits (350 total),\nwhich give coarse image labels and allow steering of the dataset composition\nwithout labeling individual instances.", "evaluation_metadata": {}}, "webis/tldr-17": {"name": "webis/tldr-17", "description": "This corpus contains preprocessed posts from the Reddit dataset.\nThe dataset consists of 3,848,330 posts with an average length of 270 words for content,\nand 28 words for the summary.\n\nFeatures includes strings: author, body, normalizedBody, content, summary, subreddit, subreddit_id.\nContent is used as document and summary is used as summary.", "evaluation_metadata": [{"config": "default", "task": "summarization", "task_id": "summarization", "splits": {"train_split": "train"}, "col_mapping": {"content": "text", "summary": "target"}, "metrics": [{"type": "rouge", "name": "Rouge"}]}]}, "reddit_tifu": {"name": "reddit_tifu", "description": "Reddit dataset, where TIFU denotes the name of subbreddit /r/tifu.\nAs defined in the publication, styel \"short\" uses title as summary and\n\"long\" uses tldr as summary.\n\nFeatures includes:\n - document: post text without tldr.\n - tldr: tldr line.\n - title: trimmed title without tldr.\n - ups: upvotes.\n - score: score.\n - num_comments: number of comments.\n - upvote_ratio: upvote ratio.", "evaluation_metadata": {}}, "reuters21578": {"name": "reuters21578", "description": "The Reuters-21578 dataset is one of the most widely used data collections for text\ncategorization research. It is collected from the Reuters financial newswire service in 1987.", "evaluation_metadata": {}}, "riddle_sense": {"name": "riddle_sense", "description": "Answering such a riddle-style question is a challenging cognitive process, in that it requires\ncomplex commonsense reasoning abilities, an understanding of figurative language, and counterfactual reasoning\nskills, which are all important abilities for advanced natural language understanding (NLU). However,\nthere is currently no dedicated datasets aiming to test these abilities. Herein, we present RiddleSense,\na new multiple-choice question answering task, which comes with the first large dataset (5.7k examples) for answering\nriddle-style commonsense questions. We systematically evaluate a wide range of models over the challenge,\nand point out that there is a large gap between the best-supervised model and human performance \u2014 suggesting\nintriguing future research in the direction of higher-order commonsense reasoning and linguistic creativity towards\nbuilding advanced NLU systems.", "evaluation_metadata": {}}, "ro_sent": {"name": "ro_sent", "description": "This dataset is a Romanian Sentiment Analysis dataset.\nIt is present in a processed form, as used by the authors of `Romanian Transformers`\nin their examples and based on the original data present in\n`https://github.com/katakonst/sentiment-analysis-tensorflow`. The original dataset is collected\nfrom product and movie reviews in Romanian.", "evaluation_metadata": {}}, "ro_sts": {"name": "ro_sts", "description": "The RO-STS (Romanian Semantic Textual Similarity) dataset contains 8628 pairs of sentences with their similarity score. It is a high-quality translation of the STS benchmark dataset.", "evaluation_metadata": {}}, "roman_urdu": {"name": "roman_urdu", "description": "This is an extensive compilation of Roman Urdu Dataset (Urdu written in Latin/Roman script) tagged for sentiment analysis.", "evaluation_metadata": {}}, "ronec": {"name": "ronec", "description": "RONEC - the Romanian Named Entity Corpus, at version 2.0, holds 12330 sentences with over 0.5M tokens, annotated with 15 classes, to a total of 80.283 distinctly annotated entities. It is used for named entity recognition and represents the largest Romanian NER corpus to date.", "evaluation_metadata": {}}, "ropes": {"name": "ropes", "description": "ROPES (Reasoning Over Paragraph Effects in Situations) is a QA dataset\nwhich tests a system's ability to apply knowledge from a passage\nof text to a new situation. A system is presented a background\npassage containing a causal or qualitative relation(s) (e.g.,\n\"animal pollinators increase efficiency of fertilization in flowers\"),\na novel situation that uses this background, and questions that require\nreasoning about effects of the relationships in the background\npassage in the background of the situation.", "evaluation_metadata": {}}, "rotten_tomatoes": {"name": "rotten_tomatoes", "description": "Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews. This data was first used in Bo\nPang and Lillian Lee, ``Seeing stars: Exploiting class relationships for\nsentiment categorization with respect to rating scales.'', Proceedings of the\nACL, 2005.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1", "args": {"average": "binary"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "RussianNLP/russian_super_glue": {"name": "RussianNLP/russian_super_glue", "description": "Recent advances in the field of universal language models and transformers require the development of a methodology for\ntheir broad diagnostics and testing for general intellectual skills - detection of natural language inference,\ncommonsense reasoning, ability to perform simple logical operations regardless of text subject or lexicon. For the first\ntime, a benchmark of nine tasks, collected and organized analogically to the SuperGLUE methodology, was developed from\nscratch for the Russian language. We provide baselines, human level evaluation, an open-source framework for evaluating\nmodels and an overall leaderboard of transformer models for the Russian language.", "evaluation_metadata": {}}, "samsum": {"name": "samsum", "description": "SAMSum Corpus contains over 16k chat dialogues with manually annotated\nsummaries.\nThere are two features:\n - dialogue: text of dialogue.\n - summary: human written summary of the dialogue.\n - id: id of a example.", "evaluation_metadata": [{"config": "samsum", "task": "summarization", "task_id": "summarization", "splits": {"eval_split": "test"}, "col_mapping": {"dialogue": "text", "summary": "target"}}]}, "sanskrit_classic": {"name": "sanskrit_classic", "description": "This dataset combines some of the classical Sanskrit texts.", "evaluation_metadata": {}}, "saudinewsnet": {"name": "saudinewsnet", "description": "The dataset contains a set of 31,030 Arabic newspaper articles alongwith metadata, extracted from various online Saudi newspapers and written in MSA.", "evaluation_metadata": {}}, "sberquad": {"name": "sberquad", "description": "Sber Question Answering Dataset (SberQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable. Russian original analogue presented in Sberbank Data Science Journey 2017.", "evaluation_metadata": {}}, "scan": {"name": "scan", "description": "SCAN tasks with various splits.\n\nSCAN is a set of simple language-driven navigation tasks for studying\ncompositional learning and zero-shot generalization.\n\nSee https://github.com/brendenlake/SCAN for a description of the splits.\n\nExample usage:\ndata = datasets.load_dataset('scan/length')", "evaluation_metadata": {}}, "scb_mt_enth_2020": {"name": "scb_mt_enth_2020", "description": "scb-mt-en-th-2020: A Large English-Thai Parallel Corpus\nThe primary objective of our work is to build a large-scale English-Thai dataset for machine translation.\nWe construct an English-Thai machine translation dataset with over 1 million segment pairs, curated from various sources,\nnamely news, Wikipedia articles, SMS messages, task-based dialogs, web-crawled data and government documents.\nMethodology for gathering data, building parallel texts and removing noisy sentence pairs are presented in a reproducible manner.\nWe train machine translation models based on this dataset. Our models' performance are comparable to that of\nGoogle Translation API (as of May 2020) for Thai-English and outperform Google when the Open Parallel Corpus (OPUS) is\nincluded in the training data for both Thai-English and English-Thai translation.\nThe dataset, pre-trained models, and source code to reproduce our work are available for public use.", "evaluation_metadata": {}}, "scene_parse_150": {"name": "scene_parse_150", "description": "Scene parsing is to segment and parse an image into different image regions associated with semantic categories, such as sky, road, person, and bed.\nMIT Scene Parsing Benchmark (SceneParse150) provides a standard training and evaluation platform for the algorithms of scene parsing.\nThe data for this benchmark comes from ADE20K Dataset which contains more than 20K scene-centric images exhaustively annotated with objects and object parts.\nSpecifically, the benchmark is divided into 20K images for training, 2K images for validation, and another batch of held-out images for testing.\nThere are totally 150 semantic categories included for evaluation, which include stuffs like sky, road, grass, and discrete objects like person, car, bed.\nNote that there are non-uniform distribution of objects occuring in the images, mimicking a more natural object occurrence in daily scene.", "evaluation_metadata": {}}, "schema_guided_dstc8": {"name": "schema_guided_dstc8", "description": "The Schema-Guided Dialogue dataset (SGD) was developed for the Dialogue State Tracking task of the Eights Dialogue Systems Technology Challenge (dstc8).\nThe SGD dataset consists of over 18k annotated multi-domain, task-oriented conversations between a human and a virtual assistant.\nThese conversations involve interactions with services and APIs spanning 17 domains, ranging from banks and events to media, calendar, travel, and weather.\nFor most of these domains, the SGD dataset contains multiple different APIs, many of which have overlapping functionalities but different interfaces,\nwhich reflects common real-world scenarios.", "evaluation_metadata": {}}, "allenai/scicite": {"name": "allenai/scicite", "description": "This is a dataset for classifying citation intents in academic papers.\nThe main citation intent label for each Json object is specified with the label\nkey while the citation context is specified in with a context key. Example:\n{\n 'string': 'In chacma baboons, male-infant relationships can be linked to both\n formation of friendships and paternity success [30,31].'\n 'sectionName': 'Introduction',\n 'label': 'background',\n 'citingPaperId': '7a6b2d4b405439',\n 'citedPaperId': '9d1abadc55b5e0',\n ...\n }\nYou may obtain the full information about the paper using the provided paper ids\nwith the Semantic Scholar API (https://api.semanticscholar.org/).\nThe labels are:\nMethod, Background, Result", "evaluation_metadata": {}}, "scielo": {"name": "scielo", "description": "A parallel corpus of full-text scientific articles collected from Scielo database in the following languages: English, Portuguese and Spanish. The corpus is sentence aligned for all language pairs, as well as trilingual aligned for a small subset of sentences. Alignment was carried out using the Hunalign algorithm.", "evaluation_metadata": {}}, "scientific_papers": {"name": "scientific_papers", "description": "Scientific papers datasets contains two sets of long and structured documents.\nThe datasets are obtained from ArXiv and PubMed OpenAccess repositories.\n\nBoth \"arxiv\" and \"pubmed\" have two features:\n - article: the body of the document, pagragraphs seperated by \"/n\".\n - abstract: the abstract of the document, pagragraphs seperated by \"/n\".\n - section_names: titles of sections, seperated by \"/n\".", "evaluation_metadata": {}}, "allenai/scifact": {"name": "allenai/scifact", "description": "SciFact, a dataset of 1.4K expert-written scientific claims paired with evidence-containing abstracts, and annotated with labels and rationales.", "evaluation_metadata": {}}, "sciq": {"name": "sciq", "description": "The SciQ dataset contains 13,679 crowdsourced science exam questions about Physics, Chemistry and Biology, among others. The questions are in multiple-choice format with 4 answer options each. For the majority of the questions, an additional paragraph with supporting evidence for the correct answer is provided.", "evaluation_metadata": {}}, "scitail": {"name": "scitail", "description": "The SciTail dataset is an entailment dataset created from multiple-choice science exams and web sentences. Each question\nand the correct answer choice are converted into an assertive statement to form the hypothesis. We use information\nretrieval to obtain relevant text from a large text corpus of web sentences, and use these sentences as a premise P. We\ncrowdsource the annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order to create\nthe SciTail dataset. The dataset contains 27,026 examples with 10,101 examples with entails label and 16,925 examples\nwith neutral label", "evaluation_metadata": {}}, "allenai/scitldr": {"name": "allenai/scitldr", "description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.", "evaluation_metadata": {}}, "search_qa": {"name": "search_qa", "description": "We publicly release a new large-scale dataset, called SearchQA, for machine comprehension, or question-answering. Unlike recently released datasets, such as DeepMind\nCNN/DailyMail and SQuAD, the proposed SearchQA was constructed to reflect a full pipeline of general question-answering. That is, we start not from an existing article\nand generate a question-answer pair, but start from an existing question-answer pair, crawled from J! Archive, and augment it with text snippets retrieved by Google.\nFollowing this approach, we built SearchQA, which consists of more than 140k question-answer pairs with each pair having 49.6 snippets on average. Each question-answer-context\n tuple of the SearchQA comes with additional meta-data such as the snippet's URL, which we believe will be valuable resources for future research. We conduct human evaluation\n as well as test two baseline methods, one simple word selection and the other deep learning based, on the SearchQA. We show that there is a meaningful gap between the human\n and machine performances. This suggests that the proposed dataset could well serve as a benchmark for question-answering.", "evaluation_metadata": {}}, "sede": {"name": "sede", "description": "SEDE (Stack Exchange Data Explorer) is new dataset for Text-to-SQL tasks with more than 12,000 SQL queries and their\nnatural language description. It's based on a real usage of users from the Stack Exchange Data Explorer platform,\nwhich brings complexities and challenges never seen before in any other semantic parsing dataset like\nincluding complex nesting, dates manipulation, numeric and text manipulation, parameters, and most\nimportantly: under-specification and hidden-assumptions.\n\nPaper (NLP4Prog workshop at ACL2021): https://arxiv.org/abs/2106.05006", "evaluation_metadata": {}}, "selqa": {"name": "selqa", "description": "The SelQA dataset provides crowdsourced annotation for two selection-based question answer tasks,\nanswer sentence selection and answer triggering.", "evaluation_metadata": {}}, "sem_eval_2010_task_8": {"name": "sem_eval_2010_task_8", "description": "The SemEval-2010 Task 8 focuses on Multi-way classification of semantic relations between pairs of nominals.\nThe task was designed to compare different approaches to semantic relation classification\nand to provide a standard testbed for future research.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"sentence": "text", "relation": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "sem_eval_2014_task_1": {"name": "sem_eval_2014_task_1", "description": "The SemEval-2014 Task 1 focuses on Evaluation of Compositional Distributional Semantic Models\non Full Sentences through Semantic Relatedness and Entailment. The task was designed to\npredict the degree of relatedness between two sentences and to detect the entailment\nrelation holding between them.", "evaluation_metadata": {}}, "sem_eval_2018_task_1": {"name": "sem_eval_2018_task_1", "description": " SemEval-2018 Task 1: Affect in Tweets: SubTask 5: Emotion Classification.\n This is a dataset for multilabel emotion classification for tweets.\n 'Given a tweet, classify it as 'neutral or no emotion' or as one, or more, of eleven given emotions that best represent the mental state of the tweeter.'\n It contains 22467 tweets in three languages manually annotated by crowdworkers using Best\u2013Worst Scaling.", "evaluation_metadata": {}}, "sent_comp": {"name": "sent_comp", "description": "Large corpus of uncompressed and compressed sentences from news articles.", "evaluation_metadata": {}}, "senti_lex": {"name": "senti_lex", "description": "This dataset add sentiment lexicons for 81 languages generated via graph propagation based on a knowledge graph--a graphical representation of real-world entities and the links between them.", "evaluation_metadata": {}}, "sentiment140": {"name": "sentiment140", "description": "Sentiment140 consists of Twitter messages with emoticons, which are used as noisy labels for\nsentiment classification. For more detailed information please refer to the paper.", "evaluation_metadata": [{"config": "sentiment140", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "sentiment": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "sepedi_ner": {"name": "sepedi_ner", "description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.", "evaluation_metadata": {}}, "sesotho_ner_corpus": {"name": "sesotho_ner_corpus", "description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.", "evaluation_metadata": {}}, "setimes": {"name": "setimes", "description": "SETimes \u2013 A Parallel Corpus of English and South-East European Languages\nThe corpus is based on the content published on the SETimes.com news portal. The news portal publishes \u201cnews and views from Southeast Europe\u201d in ten languages: Bulgarian, Bosnian, Greek, English, Croatian, Macedonian, Romanian, Albanian and Serbian. This version of the corpus tries to solve the issues present in an older version of the corpus (published inside OPUS, described in the LREC 2010 paper by Francis M. Tyers and Murat Serdar Alperen). The following procedures were applied to resolve existing issues:\n\n- stricter extraction process \u2013 no HTML residues present\n- language identification on every non-English document \u2013 non-English online documents contain English material in case the article was not translated into that language\n- resolving encoding issues in Croatian and Serbian \u2013 diacritics were partially lost due to encoding errors \u2013 text was rediacritized.", "evaluation_metadata": {}}, "setswana_ner_corpus": {"name": "setswana_ner_corpus", "description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.", "evaluation_metadata": {}}, "sharc": {"name": "sharc", "description": "ShARC is a Conversational Question Answering dataset focussing on question answering from texts containing rules. The goal is to answer questions by possibly asking follow-up questions first. It is assumed assume that the question is often underspecified, in the sense that the question does not provide enough information to be answered directly. However, an agent can use the supporting rule text to infer what needs to be asked in order to determine the final answer.", "evaluation_metadata": {}}, "sharc_modified": {"name": "sharc_modified", "description": "ShARC, a conversational QA task, requires a system to answer user questions based on rules expressed in natural language text. However, it is found that in the ShARC dataset there are multiple spurious patterns that could be exploited by neural models. SharcModified is a new dataset which reduces the patterns identified in the original dataset. To reduce the sensitivity of neural models, for each occurence of an instance conforming to any of the patterns, we automatically construct alternatives where we choose to either replace the current instance with an alternative instance which does not exhibit the pattern; or retain the original instance. The modified ShARC has two versions sharc-mod and history-shuffled. For morre details refer to Appendix A.3 .", "evaluation_metadata": {}}, "sick": {"name": "sick", "description": "Shared and internationally recognized benchmarks are fundamental for the development of any computational system.\nWe aim to help the research community working on compositional distributional semantic models (CDSMs) by providing SICK (Sentences Involving Compositional Knowldedge), a large size English benchmark tailored for them.\nSICK consists of about 10,000 English sentence pairs that include many examples of the lexical, syntactic and semantic phenomena that CDSMs are expected to account for, but do not require dealing with other aspects of existing sentential data sets (idiomatic multiword expressions, named entities, telegraphic language) that are not within the scope of CDSMs.\nBy means of crowdsourcing techniques, each pair was annotated for two crucial semantic tasks: relatedness in meaning (with a 5-point rating scale as gold score) and entailment relation between the two elements (with three possible gold labels: entailment, contradiction, and neutral).\nThe SICK data set was used in SemEval-2014 Task 1, and it freely available for research purposes.", "evaluation_metadata": {}}, "silicone": {"name": "silicone", "description": "The Sequence labellIng evaLuatIon benChmark fOr spoken laNguagE (SILICONE) benchmark is a collection\n of resources for training, evaluating, and analyzing natural language understanding systems\n specifically designed for spoken language. All datasets are in the English language and cover a\n variety of domains including daily life, scripted scenarios, joint task completion, phone call\n conversations, and televsion dialogue. Some datasets additionally include emotion and/or sentimant\n labels.", "evaluation_metadata": {}}, "simple_questions_v2": {"name": "simple_questions_v2", "description": "SimpleQuestions is a dataset for simple QA, which consists\nof a total of 108,442 questions written in natural language by human\nEnglish-speaking annotators each paired with a corresponding fact,\nformatted as (subject, relationship, object), that provides the answer\nbut also a complete explanation. Fast have been extracted from the\nKnowledge Base Freebase (freebase.com). We randomly shuffle these\nquestions and use 70% of them (75910) as training set, 10% as\nvalidation set (10845), and the remaining 20% as test set.", "evaluation_metadata": {}}, "siswati_ner_corpus": {"name": "siswati_ner_corpus", "description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.", "evaluation_metadata": {}}, "smartdata": {"name": "smartdata", "description": "DFKI SmartData Corpus is a dataset of 2598 German-language documents\nwhich has been annotated with fine-grained geo-entities, such as streets,\nstops and routes, as well as standard named entity types. It has also\nbeen annotated with a set of 15 traffic- and industry-related n-ary\nrelations and events, such as Accidents, Traffic jams, Acquisitions,\nand Strikes. The corpus consists of newswire texts, Twitter messages,\nand traffic reports from radio stations, police and railway companies.\nIt allows for training and evaluating both named entity recognition\nalgorithms that aim for fine-grained typing of geo-entities, as well\nas n-ary relation extraction systems.", "evaluation_metadata": {}}, "sms_spam": {"name": "sms_spam", "description": "The SMS Spam Collection v.1 is a public set of SMS labeled messages that have been collected for mobile phone spam research.\nIt has one collection composed by 5,574 English, real and non-enconded messages, tagged according being legitimate (ham) or spam.", "evaluation_metadata": [{"config": "plain_text", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train"}, "col_mapping": {"sms": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "snips_built_in_intents": {"name": "snips_built_in_intents", "description": "Snips' built in intents dataset was initially used to compare different voice assistants and released as a public dataset hosted at\nhttps://github.com/sonos/nlu-benchmark 2016-12-built-in-intents. The dataset contains 328 utterances over 10 intent classes. The\nrelated paper mentioned on the github page is https://arxiv.org/abs/1805.10190 and a related Medium post is\nhttps://medium.com/snips-ai/benchmarking-natural-language-understanding-systems-d35be6ce568d .", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "train_split": "train", "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "snli": {"name": "snli", "description": "The SNLI corpus (version 1.0) is a collection of 570k human-written English\nsentence pairs manually labeled for balanced classification with the labels\nentailment, contradiction, and neutral, supporting the task of natural language\ninference (NLI), also known as recognizing textual entailment (RTE).", "evaluation_metadata": {}}, "snow_simplified_japanese_corpus": {"name": "snow_simplified_japanese_corpus", "description": "About SNOW T15: The simplified corpus for the Japanese language. The corpus has 50,000 manually simplified and aligned sentences. This corpus contains the original sentences, simplified sentences and English translation of the original sentences. It can be used for automatic text simplification as well as translating simple Japanese into English and vice-versa. The core vocabulary is restricted to 2,000 words where it is selected by accounting for several factors such as meaning preservation, variation, simplicity and the UniDic word segmentation criterion.\nFor details, refer to the explanation page of Japanese simplification (http://www.jnlp.org/research/Japanese_simplification). The original texts are from \"small_parallel_enja: 50k En/Ja Parallel Corpus for Testing SMT Methods\", which is a bilingual corpus for machine translation. About SNOW T23: An expansion corpus of 35,000 sentences rewritten in easy Japanese (simple Japanese vocabulary) based on SNOW T15. The original texts are from \"Tanaka Corpus\" (http://www.edrdg.org/wiki/index.php/Tanaka_Corpus).", "evaluation_metadata": {}}, "social_bias_frames": {"name": "social_bias_frames", "description": "Social Bias Frames is a new way of representing the biases and offensiveness that are implied in language.\nFor example, these frames are meant to distill the implication that \"women (candidates) are less qualified\"\nbehind the statement \"we shouldn\u2019t lower our standards to hire more women.\"", "evaluation_metadata": {}}, "social_i_qa": {"name": "social_i_qa", "description": "We introduce Social IQa: Social Interaction QA, a new question-answering benchmark for testing social commonsense intelligence. Contrary to many prior benchmarks that focus on physical or taxonomic knowledge, Social IQa focuses on reasoning about people\u2019s actions and their social implications. For example, given an action like \"Jesse saw a concert\" and a question like \"Why did Jesse do this?\", humans can easily infer that Jesse wanted \"to see their favorite performer\" or \"to enjoy the music\", and not \"to see what's happening inside\" or \"to see if it works\". The actions in Social IQa span a wide variety of social situations, and answer candidates contain both human-curated answers and adversarially-filtered machine-generated candidates. Social IQa contains over 37,000 QA pairs for evaluating models\u2019 abilities to reason about the social implications of everyday events and situations. (Less)", "evaluation_metadata": {}}, "sofc_materials_articles": {"name": "sofc_materials_articles", "description": "The SOFC-Exp corpus consists of 45 open-access scholarly articles annotated by domain experts.\nA corpus and an inter-annotator agreement study demonstrate the complexity of the suggested\nnamed entity recognition and slot filling tasks as well as high annotation quality is presented\nin the accompanying paper.", "evaluation_metadata": {}}, "sogou_news": {"name": "sogou_news", "description": "The Sogou News dataset is a mixture of 2,909,551 news articles from the SogouCA and SogouCS news corpora, in 5 categories.\nThe number of training samples selected for each class is 90,000 and testing 12,000. Note that the Chinese characters have been converted to Pinyin.\nclassification labels of the news are determined by their domain names in the URL. For example, the news with\nURL http://sports.sohu.com is categorized as a sport class.", "evaluation_metadata": {}}, "spc": {"name": "spc", "description": "This is a collection of parallel corpora collected by Hercules Dalianis and his research group for bilingual dictionary construction.\nMore information in: Hercules Dalianis, Hao-chun Xing, Xin Zhang: Creating a Reusable English-Chinese Parallel Corpus for Bilingual Dictionary Construction, In Proceedings of LREC2010 (source: http://people.dsv.su.se/~hercules/SEC/) and Konstantinos Charitakis (2007): Using Parallel Corpora to Create a Greek-English Dictionary with UPLUG, In Proceedings of NODALIDA 2007. Afrikaans-English: Aldin Draghoender and Mattias Kanhov: Creating a reusable English \u2013 Afrikaans parallel corpora for bilingual dictionary construction\n\n4 languages, 3 bitexts\ntotal number of files: 6\ntotal number of tokens: 1.32M\ntotal number of sentence fragments: 0.15M", "evaluation_metadata": {}}, "species_800": {"name": "species_800", "description": "We have developed an efficient algorithm and implementation of a dictionary-based approach to named entity recognition,\nwhich we here use to identifynames of species and other taxa in text. The tool, SPECIES, is more than an order of\nmagnitude faster and as accurate as existing tools. The precision and recall was assessed both on an existing gold-standard\ncorpus and on a new corpus of 800 abstracts, which were manually annotated after the development of the tool. The corpus\ncomprises abstracts from journals selected to represent many taxonomic groups, which gives insights into which types of\norganism names are hard to detect and which are easy. Finally, we have tagged organism names in the entire Medline database\nand developed a web resource, ORGANISMS, that makes the results accessible to the broad community of biologists.", "evaluation_metadata": {}}, "speech_commands": {"name": "speech_commands", "description": "This is a set of one-second .wav audio files, each containing a single spoken\nEnglish word or background noise. These words are from a small set of commands, and are spoken by a\nvariety of different speakers. This data set is designed to help train simple\nmachine learning models. This dataset is covered in more detail at\n[https://arxiv.org/abs/1804.03209](https://arxiv.org/abs/1804.03209).\n\nVersion 0.01 of the data set (configuration `\"v0.01\"`) was released on August 3rd 2017 and contains\n64,727 audio files.\n\nIn version 0.01 thirty different words were recoded: \"Yes\", \"No\", \"Up\", \"Down\", \"Left\",\n\"Right\", \"On\", \"Off\", \"Stop\", \"Go\", \"Zero\", \"One\", \"Two\", \"Three\", \"Four\", \"Five\", \"Six\", \"Seven\", \"Eight\", \"Nine\",\n\"Bed\", \"Bird\", \"Cat\", \"Dog\", \"Happy\", \"House\", \"Marvin\", \"Sheila\", \"Tree\", \"Wow\".\n\n\nIn version 0.02 more words were added: \"Backward\", \"Forward\", \"Follow\", \"Learn\", \"Visual\".\n\nIn both versions, ten of them are used as commands by convention: \"Yes\", \"No\", \"Up\", \"Down\", \"Left\",\n\"Right\", \"On\", \"Off\", \"Stop\", \"Go\". Other words are considered to be auxiliary (in current implementation\nit is marked by `True` value of `\"is_unknown\"` feature). Their function is to teach a model to distinguish core words\nfrom unrecognized ones.\n\nThe `_silence_` class contains a set of longer audio clips that are either recordings or\na mathematical simulation of noise.", "evaluation_metadata": {}}, "spider": {"name": "spider", "description": "Spider is a large-scale complex and cross-domain semantic parsing and text-toSQL dataset annotated by 11 college students", "evaluation_metadata": {}}, "squad": {"name": "squad", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": [{"config": "plain_text", "task": "question-answering", "task_id": "extractive_question_answering", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question": "question", "context": "context", "answers": {"text": "text", "answer_start": "answer_start"}}, "metrics": [{"type": "squad", "name": "SQuAD"}]}]}, "squad_es": {"name": "squad_es", "description": "automatic translation of the Stanford Question Answering Dataset (SQuAD) v2 into Spanish", "evaluation_metadata": {}}, "squad_it": {"name": "squad_it", "description": "SQuAD-it is derived from the SQuAD dataset and it is obtained through semi-automatic translation of the SQuAD dataset\ninto Italian. It represents a large-scale dataset for open question answering processes on factoid questions in Italian.\n The dataset contains more than 60,000 question/answer pairs derived from the original English dataset. The dataset is\n split into training and test sets to support the replicability of the benchmarking of QA systems:", "evaluation_metadata": {}}, "squad_kor_v1": {"name": "squad_kor_v1", "description": "KorQuAD 1.0 is a large-scale Korean dataset for machine reading comprehension task consisting of human generated questions for Wikipedia articles. We benchmark the data collecting process of SQuADv1.0 and crowdsourced 70,000+ question-answer pairs. 1,637 articles and 70,079 pairs of question answers were collected. 1,420 articles are used for the training set, 140 for the dev set, and 77 for the test set. 60,407 question-answer pairs are for the training set, 5,774 for the dev set, and 3,898 for the test set.", "evaluation_metadata": {}}, "squad_kor_v2": {"name": "squad_kor_v2", "description": "KorQuAD 2.0 is a Korean question and answering dataset consisting of a total of 100,000+ pairs. There are three major differences from KorQuAD 1.0, which is the standard Korean Q & A data. The first is that a given document is a whole Wikipedia page, not just one or two paragraphs. Second, because the document also contains tables and lists, it is necessary to understand the document structured with HTML tags. Finally, the answer can be a long text covering not only word or phrase units, but paragraphs, tables, and lists. As a baseline model, BERT Multilingual is used, released by Google as an open source. It shows 46.0% F1 score, a very low score compared to 85.7% of the human F1 score. It indicates that this data is a challenging task. Additionally, we increased the performance by no-answer data augmentation. Through the distribution of this data, we intend to extend the limit of MRC that was limited to plain text to real world tasks of various lengths and formats.", "evaluation_metadata": {}}, "squad_v1_pt": {"name": "squad_v1_pt", "description": "Portuguese translation of the SQuAD dataset. The translation was performed automatically using the Google Cloud API.", "evaluation_metadata": {}}, "squad_v2": {"name": "squad_v2", "description": "combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers\n to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but\n also determine when no answer is supported by the paragraph and abstain from answering.", "evaluation_metadata": [{"config": "squad_v2", "task": "question-answering", "task_id": "extractive_question_answering", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question": "question", "context": "context", "answers": {"text": "text", "answer_start": "answer_start"}}, "metrics": [{"type": "squad_v2", "name": "SQuAD v2"}]}]}, "srwac": {"name": "srwac", "description": "The Serbian web corpus srWaC was built by crawling the .rs top-level domain in 2014. The corpus was near-deduplicated on paragraph level, normalised via diacritic restoration, morphosyntactically annotated and lemmatised. The corpus is shuffled by paragraphs. Each paragraph contains metadata on the URL, domain and language identification (Serbian vs. Croatian).\nVersion 1.0 of this corpus is described in http://www.aclweb.org/anthology/W14-0405. Version 1.1 contains newer and better linguistic annotations.", "evaluation_metadata": {}}, "sst": {"name": "sst", "description": "The Stanford Sentiment Treebank, the first corpus with fully labeled parse trees that allows for a\ncomplete analysis of the compositional effects of sentiment in language.", "evaluation_metadata": {}}, "stereoset": {"name": "stereoset", "description": "Stereoset is a dataset that measures stereotype bias in language models. Stereoset consists of 17,000 sentences that\nmeasures model preferences across gender, race, religion, and profession.", "evaluation_metadata": {}}, "stsb_multi_mt": {"name": "stsb_multi_mt", "description": "These are different multilingual translations and the English original of the STSbenchmark dataset. Translation has been done with deepl.com.", "evaluation_metadata": {}}, "subjqa": {"name": "subjqa", "description": "SubjQA is a question answering dataset that focuses on subjective questions and answers.\nThe dataset consists of roughly 10,000 questions over reviews from 6 different domains: books, movies, grocery,\nelectronics, TripAdvisor (i.e. hotels), and restaurants.", "evaluation_metadata": {}}, "super_glue": {"name": "super_glue", "description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.", "evaluation_metadata": {}}, "superb": {"name": "superb", "description": "Self-supervised learning (SSL) has proven vital for advancing research in\nnatural language processing (NLP) and computer vision (CV). The paradigm\npretrains a shared model on large volumes of unlabeled data and achieves\nstate-of-the-art (SOTA) for various tasks with minimal adaptation. However, the\nspeech processing community lacks a similar setup to systematically explore the\nparadigm. To bridge this gap, we introduce Speech processing Universal\nPERformance Benchmark (SUPERB). SUPERB is a leaderboard to benchmark the\nperformance of a shared model across a wide range of speech processing tasks\nwith minimal architecture changes and labeled data. Among multiple usages of the\nshared model, we especially focus on extracting the representation learned from\nSSL due to its preferable re-usability. We present a simple framework to solve\nSUPERB tasks by learning task-specialized lightweight prediction heads on top of\nthe frozen shared model. Our results demonstrate that the framework is promising\nas SSL representations show competitive generalizability and accessibility\nacross SUPERB tasks. We release SUPERB as a challenge with a leaderboard and a\nbenchmark toolkit to fuel the research in representation learning and general\nspeech processing.\n\nNote that in order to limit the required storage for preparing this dataset, the\naudio is stored in the .wav format and is not converted to a float32 array. To\nconvert the audio file to a float32 array, please make use of the `.map()`\nfunction as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```", "evaluation_metadata": {}}, "svhn": {"name": "svhn", "description": "SVHN is a real-world image dataset for developing machine learning and object recognition algorithms with minimal requirement on data preprocessing and formatting.\nIt can be seen as similar in flavor to MNIST (e.g., the images are of small cropped digits), but incorporates an order of magnitude more labeled data (over 600,000 digit images)\nand comes from a significantly harder, unsolved, real world problem (recognizing digits and numbers in natural scene images). SVHN is obtained from house numbers in Google Street View images.", "evaluation_metadata": {}}, "swag": {"name": "swag", "description": "Given a partial description like \"she opened the hood of the car,\"\nhumans can reason about the situation and anticipate what might come\nnext (\"then, she examined the engine\"). SWAG (Situations With Adversarial Generations)\nis a large-scale dataset for this task of grounded commonsense\ninference, unifying natural language inference and physically grounded reasoning.\n\nThe dataset consists of 113k multiple choice questions about grounded situations\n(73k training, 20k validation, 20k test).\nEach question is a video caption from LSMDC or ActivityNet Captions,\nwith four answer choices about what might happen next in the scene.\nThe correct answer is the (real) video caption for the next event in the video;\nthe three incorrect answers are adversarially generated and human verified,\nso as to fool machines but not humans. SWAG aims to be a benchmark for\nevaluating grounded commonsense NLI and for learning representations.\n\nThe full data contain more information,\nbut the regular configuration will be more interesting for modeling\n(note that the regular data are shuffled). The test set for leaderboard submission\nis under the regular configuration.", "evaluation_metadata": {}}, "swahili": {"name": "swahili", "description": "The Swahili dataset developed specifically for language modeling task.\nThe dataset contains 28,000 unique words with 6.84M, 970k, and 2M words for the train,\nvalid and test partitions respectively which represent the ratio 80:10:10.\nThe entire dataset is lowercased, has no punctuation marks and,\nthe start and end of sentence markers have been incorporated to facilitate easy tokenization during language modeling.", "evaluation_metadata": {}}, "swahili_news": {"name": "swahili_news", "description": "Swahili is spoken by 100-150 million people across East Africa. In Tanzania, it is one of two national languages (the other is English) and it is the official language of instruction in all schools. News in Swahili is an important part of the media sphere in Tanzania.\n\nNews contributes to education, technology, and the economic growth of a country, and news in local languages plays an important cultural role in many Africa countries. In the modern age, African languages in news and other spheres are at risk of being lost as English becomes the dominant language in online spaces.\n\nThe Swahili news dataset was created to reduce the gap of using the Swahili language to create NLP technologies and help AI practitioners in Tanzania and across Africa continent to practice their NLP skills to solve different problems in organizations or societies related to Swahili language. Swahili News were collected from different websites that provide news in the Swahili language. I was able to find some websites that provide news in Swahili only and others in different languages including Swahili.\n\nThe dataset was created for a specific task of text classification, this means each news content can be categorized into six different topics (Local news, International news , Finance news, Health news, Sports news, and Entertainment news). The dataset comes with a specified train/test split. The train set contains 75% of the dataset and test set contains 25% of the dataset.", "evaluation_metadata": {}}, "swda": {"name": "swda", "description": "The Switchboard Dialog Act Corpus (SwDA) extends the Switchboard-1 Telephone Speech Corpus, Release 2 with\nturn/utterance-level dialog-act tags. The tags summarize syntactic, semantic, and pragmatic information about the\nassociated turn. The SwDA project was undertaken at UC Boulder in the late 1990s.\nThe SwDA is not inherently linked to the Penn Treebank 3 parses of Switchboard, and it is far from straightforward to\nalign the two resources. In addition, the SwDA is not distributed with the Switchboard's tables of metadata about the\nconversations and their participants.", "evaluation_metadata": {}}, "swedish_medical_ner": {"name": "swedish_medical_ner", "description": "SwedMedNER is a dataset for training and evaluating Named Entity Recognition systems on medical texts in Swedish.\nIt is derived from medical articles on the Swedish Wikipedia, L\u00e4kartidningen, and 1177 V\u00e5rdguiden.", "evaluation_metadata": {}}, "swedish_ner_corpus": {"name": "swedish_ner_corpus", "description": "Webbnyheter 2012 from Spraakbanken, semi-manually annotated and adapted for CoreNLP Swedish NER. Semi-manually defined in this case as: Bootstrapped from Swedish Gazetters then manually correcte/reviewed by two independent native speaking swedish annotators. No annotator agreement calculated.", "evaluation_metadata": {}}, "rcds/swiss_judgment_prediction": {"name": "rcds/swiss_judgment_prediction", "description": "Swiss-Judgment-Prediction is a multilingual, diachronic dataset of 85K Swiss Federal Supreme Court (FSCS) cases annotated with the respective binarized judgment outcome (approval/dismissal), posing a challenging text classification task. We also provide additional metadata, i.e., the publication year, the legal area and the canton of origin per case, to promote robustness and fairness studies on the critical area of legal NLP.", "evaluation_metadata": {}}, "tab_fact": {"name": "tab_fact", "description": "The problem of verifying whether a textual hypothesis holds the truth based on the given evidence, also known as fact verification, plays an important role in the study of natural language understanding and semantic representation. However, existing studies are restricted to dealing with unstructured textual evidence (e.g., sentences and passages, a pool of passages), while verification using structured forms of evidence, such as tables, graphs, and databases, remains unexplored. TABFACT is large scale dataset with 16k Wikipedia tables as evidence for 118k human annotated statements designed for fact verification with semi-structured evidence. The statements are labeled as either ENTAILED or REFUTED. TABFACT is challenging since it involves both soft linguistic reasoning and hard symbolic reasoning.", "evaluation_metadata": {}}, "tamilmixsentiment": {"name": "tamilmixsentiment", "description": "The first gold standard Tamil-English code-switched, sentiment-annotated corpus containing 15,744 comment posts from YouTube. Train: 11,335 Validation: 1,260 and Test: 3,149. This makes the largest general domain sentiment dataset for this relatively low-resource language with code-mixing phenomenon. The dataset contains all the three types of code-mixed sentences - Inter-Sentential switch, Intra-Sentential switch and Tag switching. Most comments were written in Roman script with either Tamil grammar with English lexicon or English grammar with Tamil lexicon. Some comments were written in Tamil script with English expressions in between.", "evaluation_metadata": {}}, "tanzil": {"name": "tanzil", "description": "This is a collection of Quran translations compiled by the Tanzil project\nThe translations provided at this page are for non-commercial purposes only. If used otherwise, you need to obtain necessary permission from the translator or the publisher.\n\nIf you are using more than three of the following translations in a website or application, we require you to put a link back to this page to make sure that subsequent users have access to the latest updates.\n\n42 languages, 878 bitexts\ntotal number of files: 105\ntotal number of tokens: 22.33M\ntotal number of sentence fragments: 1.01M", "evaluation_metadata": {}}, "tapaco": {"name": "tapaco", "description": "A freely available paraphrase corpus for 73 languages extracted from the Tatoeba database. Tatoeba is a crowdsourcing project mainly geared towards language learners. Its aim is to provide example sentences and translations for particular linguistic constructions and words. The paraphrase corpus is created by populating a graph with Tatoeba sentences and equivalence links between sentences \u201cmeaning the same thing\u201d. This graph is then traversed to extract sets of paraphrases. Several language-independent filters and pruning steps are applied to remove uninteresting sentences. A manual evaluation performed on three languages shows that between half and three quarters of inferred paraphrases are correct and that most remaining ones are either correct but trivial, or near-paraphrases that neutralize a morphological distinction. The corpus contains a total of 1.9 million sentences, with 200 \u2013 250 000 sentences per language. It covers a range of languages for which, to our knowledge,no other paraphrase dataset exists.", "evaluation_metadata": {}}, "tashkeela": {"name": "tashkeela", "description": "Arabic vocalized texts.\nit contains 75 million of fully vocalized words mainly97 books from classical and modern Arabic language.", "evaluation_metadata": {}}, "taskmaster1": {"name": "taskmaster1", "description": "Taskmaster-1 is a goal-oriented conversational dataset. It includes 13,215 task-based dialogs comprising six domains. Two procedures were used to create this collection, each with unique advantages. The first involves a two-person, spoken \"Wizard of Oz\" (WOz) approach in which trained agents and crowdsourced workers interact to complete the task while the second is \"self-dialog\" in which crowdsourced workers write the entire dialog themselves.", "evaluation_metadata": {}}, "taskmaster2": {"name": "taskmaster2", "description": "Taskmaster is dataset for goal oriented conversations. The Taskmaster-2 dataset consists of 17,289 dialogs in the seven domains which include restaurants, food ordering, movies, hotels, flights, music and sports. Unlike Taskmaster-1, which includes both written \"self-dialogs\" and spoken two-person dialogs, Taskmaster-2 consists entirely of spoken two-person dialogs. In addition, while Taskmaster-1 is almost exclusively task-based, Taskmaster-2 contains a good number of search- and recommendation-oriented dialogs. All dialogs in this release were created using a Wizard of Oz (WOz) methodology in which crowdsourced workers played the role of a 'user' and trained call center operators played the role of the 'assistant'. In this way, users were led to believe they were interacting with an automated system that \u201cspoke\u201d using text-to-speech (TTS) even though it was in fact a human behind the scenes. As a result, users could express themselves however they chose in the context of an automated interface.", "evaluation_metadata": {}}, "taskmaster3": {"name": "taskmaster3", "description": "Taskmaster is dataset for goal oriented conversations. The Taskmaster-3 dataset consists of 23,757 movie ticketing dialogs. By \"movie ticketing\" we mean conversations where the customer's goal is to purchase tickets after deciding on theater, time, movie name, number of tickets, and date, or opt out of the transaction. This collection was created using the \"self-dialog\" method. This means a single, crowd-sourced worker is paid to create a conversation writing turns for both speakers, i.e. the customer and the ticketing agent.", "evaluation_metadata": {}}, "tatoeba": {"name": "tatoeba", "description": "This is a collection of translated sentences from Tatoeba\n359 languages, 3,403 bitexts\ntotal number of files: 750\ntotal number of tokens: 65.54M\ntotal number of sentence fragments: 8.96M", "evaluation_metadata": {}}, "ted_hrlr": {"name": "ted_hrlr", "description": "Data sets derived from TED talk transcripts for comparing similar language pairs\nwhere one is high resource and the other is low resource.", "evaluation_metadata": {}}, "ted_iwlst2013": {"name": "ted_iwlst2013", "description": "A parallel corpus of TED talk subtitles provided by CASMACAT: http://www.casmacat.eu/corpus/ted2013.html. The files are originally provided by https://wit3.fbk.eu.\n\n15 languages, 14 bitexts\ntotal number of files: 28\ntotal number of tokens: 67.67M\ntotal number of sentence fragments: 3.81M", "evaluation_metadata": {}}, "ted_multi": {"name": "ted_multi", "description": "Massively multilingual (60 language) data set derived from TED Talk transcripts.\nEach record consists of parallel arrays of language and text. Missing and\nincomplete translations will be filtered out.", "evaluation_metadata": {}}, "ted_talks_iwslt": {"name": "ted_talks_iwslt", "description": "The core of WIT3 is the TED Talks corpus, that basically redistributes the original content published by the TED Conference website (http://www.ted.com). Since 2007,\nthe TED Conference, based in California, has been posting all video recordings of its talks together with subtitles in English\nand their translations in more than 80 languages. Aside from its cultural and social relevance, this content, which is published under the Creative Commons BYNC-ND license, also represents a precious\nlanguage resource for the machine translation research community, thanks to its size, variety of topics, and covered languages.\nThis effort repurposes the original content in a way which is more convenient for machine translation researchers.", "evaluation_metadata": {}}, "tep_en_fa_para": {"name": "tep_en_fa_para", "description": "TEP: Tehran English-Persian parallel corpus. The first free Eng-Per corpus, provided by the Natural Language and Text Processing Laboratory, University of Tehran.", "evaluation_metadata": {}}, "text2log": {"name": "text2log", "description": "The dataset contains about 100,000 simple English sentences selected and filtered from enTenTen15 and their translation into First Order Logic (FOL) Lambda Dependency-based Compositional Semantics using ccg2lambda.", "evaluation_metadata": {}}, "thai_toxicity_tweet": {"name": "thai_toxicity_tweet", "description": "Thai Toxicity Tweet Corpus contains 3,300 tweets annotated by humans with guidelines including a 44-word dictionary.\nThe author obtained 2,027 and 1,273 toxic and non-toxic tweets, respectively; these were labeled by three annotators. The result of corpus\nanalysis indicates that tweets that include toxic words are not always toxic. Further, it is more likely that a tweet is toxic, if it contains\ntoxic words indicating their original meaning. Moreover, disagreements in annotation are primarily because of sarcasm, unclear existing\ntarget, and word sense ambiguity.\n\nNotes from data cleaner: The data is included into [huggingface/datasets](https://www.github.com/huggingface/datasets) in Dec 2020.\nBy this time, 506 of the tweets are not available publicly anymore. We denote these by `TWEET_NOT_FOUND` in `tweet_text`.\nProcessing can be found at [this PR](https://github.com/tmu-nlp/ThaiToxicityTweetCorpus/pull/1).", "evaluation_metadata": {}}, "thainer": {"name": "thainer", "description": "ThaiNER (v1.3) is a 6,456-sentence named entity recognition dataset created from expanding the 2,258-sentence\n[unnamed dataset](http://pioneer.chula.ac.th/~awirote/Data-Nutcha.zip) by\n[Tirasaroj and Aroonmanakun (2012)](http://pioneer.chula.ac.th/~awirote/publications/).\nIt is used to train NER taggers in [PyThaiNLP](https://github.com/PyThaiNLP/pythainlp).\nThe NER tags are annotated by [Tirasaroj and Aroonmanakun (2012)]((http://pioneer.chula.ac.th/~awirote/publications/))\nfor 2,258 sentences and the rest by [@wannaphong](https://github.com/wannaphong/).\nThe POS tags are done by [PyThaiNLP](https://github.com/PyThaiNLP/pythainlp)'s `perceptron` engine trained on `orchid_ud`.\n[@wannaphong](https://github.com/wannaphong/) is now the only maintainer of this dataset.", "evaluation_metadata": {}}, "thaiqa_squad": {"name": "thaiqa_squad", "description": "`thaiqa_squad` is an open-domain, extractive question answering dataset (4,000 questions in `train` and 74 questions in `dev`) in\n[SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) format, originally created by [NECTEC](https://www.nectec.or.th/en/) from\nWikipedia articles and adapted to [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) format by [PyThaiNLP](https://github.com/PyThaiNLP/).", "evaluation_metadata": {}}, "thaisum": {"name": "thaisum", "description": "ThaiSum is a large-scale corpus for Thai text summarization obtained from several online news websites namely Thairath,\nThaiPBS, Prachathai, and The Standard. This dataset consists of over 350,000 article and summary pairs\nwritten by journalists.", "evaluation_metadata": {}}, "EleutherAI/pile": {"name": "EleutherAI/pile", "description": "The Pile is a 825 GiB diverse, open source language modelling data set that consists of 22 smaller, high-quality\ndatasets combined together.", "evaluation_metadata": {}}, "the_pile_books3": {"name": "the_pile_books3", "description": "This dataset is Shawn Presser's work and is part of EleutherAi/The Pile dataset. This dataset contains all of bibliotik in plain .txt form, aka 197,000 books processed in exactly the same way as did for bookcorpusopen (a.k.a. books1). seems to be similar to OpenAI's mysterious \"books2\" dataset referenced in their papers. Unfortunately OpenAI will not give details, so we know very little about any differences. People suspect it's \"all of libgen\", but it's purely conjecture.", "evaluation_metadata": {}}, "the_pile_openwebtext2": {"name": "the_pile_openwebtext2", "description": "OpenWebText2 is part of EleutherAi/The Pile dataset and is an enhanced version of the original OpenWebTextCorpus covering all Reddit submissions from 2005 up until April 2020, with further months becoming available after the corresponding PushShift dump files are released.", "evaluation_metadata": {}}, "the_pile_stack_exchange": {"name": "the_pile_stack_exchange", "description": "This dataset is part of EleutherAI/The Pile dataset and is a dataset for Language Models from processing stackexchange data dump, which is an anonymized dump of all user-contributed content on the Stack Exchange network.", "evaluation_metadata": {}}, "tilde_model": {"name": "tilde_model", "description": "This is the Tilde MODEL Corpus \u2013 Multilingual Open Data for European Languages.\n\nThe data has been collected from sites allowing free use and reuse of its content, as well as from Public Sector web sites. The activities have been undertaken as part of the ODINE Open Data Incubator for Europe, which aims to support the next generation of digital businesses and fast-track the development of new products and services. The corpus includes the following parts:\nTilde MODEL - EESC is a multilingual corpus compiled from document texts of European Economic and Social Committee document portal. Source: http://dm.eesc.europa.eu/\nTilde MODEL - RAPID multilingual parallel corpus is compiled from all press releases of Press Release Database of European Commission released between 1975 and end of 2016 as available from http://europa.eu/rapid/\nTilde MODEL - ECB multilingual parallel corpus is compiled from the multilingual pages of European Central Bank web site http://ebc.europa.eu/\nTilde MODEL - EMA is a corpus compiled from texts of European Medicines Agency document portal as available in http://www.ema.europa.eu/ at the end of 2016\nTilde MODEL - World Bank is a corpus compiled from texts of World Bank as available in http://www.worldbank.org/ in 2017\nTilde MODEL - AirBaltic.com Travel Destinations is a multilingual parallel corpus compiled from description texts of AirBaltic.com travel destinations as available in https://www.airbaltic.com/en/destinations/ in 2017\nTilde MODEL - LiveRiga.com is a multilingual parallel corpus compiled from Riga tourist attractions description texts of http://liveriga.com/ web site in 2017\nTilde MODEL - Lithuanian National Philharmonic Society is a parallel corpus compiled from texts of Lithuanian National Philharmonic Society web site http://www.filharmonija.lt/ in 2017\nTilde MODEL - mupa.hu is a parallel corpus from texts of M\u00fcpa Budapest - web site of Hungarian national culture house and concert venue https://www.mupa.hu/en/ compiled in spring of 2017\nTilde MODEL - fold.lv is a parallel corpus from texts of fold.lv portal http://www.fold.lv/en/ of the best of Latvian and foreign creative industries as compiled in spring of 2017\nTilde MODEL - czechtourism.com is a multilingual parallel corpus from texts of http://czechtourism.com/ portal compiled in spring of 2017\n30 languages, 274 bitexts\ntotal number of files: 125\ntotal number of tokens: 1.43G\ntotal number of sentence fragments: 62.44M", "evaluation_metadata": {}}, "time_dial": {"name": "time_dial", "description": "TimeDial presents a crowdsourced English challenge set, for temporal commonsense reasoning, formulated\nas a multiple choice cloze task with around 1.5k carefully curated dialogs. The dataset is derived from\nthe DailyDialog (Li et al., 2017), which is a multi-turn dialog corpus.\n\nIn order to establish strong baselines and provide information on future model development, we\nconducted extensive experiments with state-of-the-art LMs. While humans can easily answer these\nquestions (97.8%), the best T5 model variant struggles on this challenge set (73%). Moreover, our\nqualitative error analyses show that the models often rely on shallow, spurious features (particularly text\nmatching), instead of truly doing reasoning over the context.", "evaluation_metadata": {}}, "tlc": {"name": "tlc", "description": "Thai Literature Corpora (TLC): Corpora of machine-ingestible Thai classical literature texts.\n\nRelease: 6/25/19\n\nIt consists of two datasets:\n\n## TLC set\nIt is texts from [Vajirayana Digital Library](https://vajirayana.org/), stored by chapters and stanzas (non-tokenized).\n\ntlc v.2.0 (6/17/19 : a total of 34 documents, 292,270 lines, 31,790,734 characters)\ntlc v.1.0 (6/11/19 : a total of 25 documents, 113,981 lines, 28,775,761 characters)\n\n## TNHC set\nIt is texts from Thai National Historical Corpus, stored by lines (manually tokenized).\n\ntnhc v.1.0 (6/25/19 : a total of 47 documents, 756,478 lines, 13,361,142 characters)", "evaluation_metadata": {}}, "tmu_gfm_dataset": {"name": "tmu_gfm_dataset", "description": "A dataset for GEC metrics with manual evaluations of grammaticality, fluency, and meaning preservation for system outputs. More detail about the creation of the dataset can be found in Yoshimura et al. (2020).", "evaluation_metadata": {}}, "told-br": {"name": "told-br", "description": "ToLD-Br is the biggest dataset for toxic tweets in Brazilian Portuguese, crowdsourced\nby 42 annotators selected from a pool of 129 volunteers. Annotators were selected aiming\nto create a plural group in terms of demographics (ethnicity, sexual orientation, age, gender).\nEach tweet was labeled by three annotators in 6 possible categories:\nLGBTQ+phobia,Xenophobia, Obscene, Insult, Misogyny and Racism.", "evaluation_metadata": {}}, "totto": {"name": "totto", "description": "ToTTo is an open-domain English table-to-text dataset with over 120,000 training examples that proposes a controlled generation task: given a Wikipedia table and a set of highlighted table cells, produce a one-sentence description.", "evaluation_metadata": {}}, "trec": {"name": "trec", "description": "The Text REtrieval Conference (TREC) Question Classification dataset contains 5500 labeled questions in training set and another 500 for test set.\n\nThe dataset has 6 coarse class labels and 50 fine class labels. Average length of each sentence is 10, vocabulary size of 8700.\n\nData are collected from four sources: 4,500 English questions published by USC (Hovy et al., 2001), about 500 manually constructed questions for a few rare classes, 894 TREC 8 and TREC 9 questions, and also 500 questions from TREC 10 which serves as the test set. These questions were manually labeled.", "evaluation_metadata": {}}, "trivia_qa": {"name": "trivia_qa", "description": "TriviaqQA is a reading comprehension dataset containing over 650K\nquestion-answer-evidence triples. TriviaqQA includes 95K question-answer\npairs authored by trivia enthusiasts and independently gathered evidence\ndocuments, six per question on average, that provide high quality distant\nsupervision for answering the questions.", "evaluation_metadata": {}}, "tsac": {"name": "tsac", "description": "Tunisian Sentiment Analysis Corpus.\n\nAbout 17k user comments manually annotated to positive and negative polarities. This corpus is collected from Facebook users comments written on official pages of Tunisian radios and TV channels namely Mosaique FM, JawhraFM, Shemes FM, HiwarElttounsi TV and Nessma TV. The corpus is collected from a period spanning January 2015 until June 2016.", "evaluation_metadata": {}}, "ttc4900": {"name": "ttc4900", "description": "The data set is taken from kemik group\nhttp://www.kemik.yildiz.edu.tr/\nThe data are pre-processed for the text categorization, collocations are found, character set is corrected, and so forth.\nWe named TTC4900 by mimicking the name convention of TTC 3600 dataset shared by the study http://journals.sagepub.com/doi/abs/10.1177/0165551515620551\n\nIf you use the dataset in a paper, please refer https://www.kaggle.com/savasy/ttc4900 as footnote and cite one of the papers as follows:\n\n- A Comparison of Different Approaches to Document Representation in Turkish Language, SDU Journal of Natural and Applied Science, Vol 22, Issue 2, 2018\n- A comparative analysis of text classification for Turkish language, Pamukkale University Journal of Engineering Science Volume 25 Issue 5, 2018\n- A Knowledge-poor Approach to Turkish Text Categorization with a Comparative Analysis, Proceedings of CICLING 2014, Springer LNCS, Nepal, 2014.", "evaluation_metadata": {}}, "tunizi": {"name": "tunizi", "description": "On social media, Arabic speakers tend to express themselves in their own local dialect. To do so, Tunisians use \"Tunisian Arabizi\", which consists in supplementing numerals to the Latin script rather than the Arabic alphabet. TUNIZI is the first Tunisian Arabizi Dataset including 3K sentences, balanced, covering different topics, preprocessed and annotated as positive and negative.", "evaluation_metadata": {}}, "tuple_ie": {"name": "tuple_ie", "description": "The TupleInf Open IE dataset contains Open IE tuples extracted from 263K sentences that were used by the solver in \u201cAnswering Complex Questions Using Open Information Extraction\u201d (referred as Tuple KB, T). These sentences were collected from a large Web corpus using training questions from 4th and 8th grade as queries. This dataset contains 156K sentences collected for 4th grade questions and 107K sentences for 8th grade questions. Each sentence is followed by the Open IE v4 tuples using their simple format.", "evaluation_metadata": {}}, "turkic_xwmt": {"name": "turkic_xwmt", "description": "A Large-Scale Study of Machine Translation in Turkic Languages", "evaluation_metadata": {}}, "turkish_ner": {"name": "turkish_ner", "description": "Turkish Wikipedia Named-Entity Recognition and Text Categorization\n(TWNERTC) dataset is a collection of automatically categorized and annotated\nsentences obtained from Wikipedia. The authors constructed large-scale\ngazetteers by using a graph crawler algorithm to extract\nrelevant entity and domain information\nfrom a semantic knowledge base, Freebase.\nThe constructed gazetteers contains approximately\n300K entities with thousands of fine-grained entity types\nunder 77 different domains.", "evaluation_metadata": {}}, "turkish_product_reviews": {"name": "turkish_product_reviews", "description": "Turkish Product Reviews.\nThis repository contains 235.165 product reviews collected online. There are 220.284 positive, 14881 negative reviews.", "evaluation_metadata": {}}, "turku_ner_corpus": {"name": "turku_ner_corpus", "description": "An open, broad-coverage corpus for Finnish named entity recognition presented in Luoma et al. (2020) A Broad-coverage Corpus for Finnish Named Entity Recognition.", "evaluation_metadata": {}}, "tweet_eval": {"name": "tweet_eval", "description": "TweetEval consists of seven heterogenous tasks in Twitter, all framed as multi-class tweet classification. All tasks have been unified into the same benchmark, with each dataset presented in the same format and with fixed training, validation and test splits.", "evaluation_metadata": [{"config": "emotion", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}, {"config": "hate", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 binary", "args": {"average": "binary"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}, {"config": "irony", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 binary", "args": {"average": "binary"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}, {"config": "offensive", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 binary", "args": {"average": "binary"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}, {"config": "sentiment", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "tweet_qa": {"name": "tweet_qa", "description": "TweetQA is the first dataset for QA on social media data by leveraging news media and crowdsourcing.", "evaluation_metadata": {}}, "tweets_ar_en_parallel": {"name": "tweets_ar_en_parallel", "description": " Twitter users often post parallel tweets\u2014tweets that contain the same content but are\n written in different languages. Parallel tweets can be an important resource for developing\n machine translation (MT) systems among other natural language processing (NLP) tasks. This\n resource is a result of a generic method for collecting parallel tweets. Using the method,\n we compiled a bilingual corpus of English-Arabic parallel tweets and a list of Twitter accounts\n who post English-Arabic tweets regularly. Additionally, we annotate a subset of Twitter accounts\n with their countries of origin and topic of interest, which provides insights about the population\n who post parallel tweets.", "evaluation_metadata": {}}, "tweets_hate_speech_detection": {"name": "tweets_hate_speech_detection", "description": "The objective of this task is to detect hate speech in tweets. For the sake of simplicity, we say a tweet contains hate speech if it has a racist or sexist sentiment associated with it. So, the task is to classify racist or sexist tweets from other tweets.\n\nFormally, given a training sample of tweets and labels, where label \u20181\u2019 denotes the tweet is racist/sexist and label \u20180\u2019 denotes the tweet is not racist/sexist, your objective is to predict the labels on the given test dataset.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train"}, "col_mapping": {"tweet": "text", "label": "target", "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 binary", "args": {"average": "binary"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}}]}, "twi_text_c3": {"name": "twi_text_c3", "description": "Twi Text C3 is the largest Twi texts collected and used to train FastText embeddings in the\nYorubaTwi Embedding paper: https://www.aclweb.org/anthology/2020.lrec-1.335/", "evaluation_metadata": {}}, "twi_wordsim353": {"name": "twi_wordsim353", "description": "A translation of the word pair similarity dataset wordsim-353 to Twi.\n\nThe dataset was presented in the paper\nAlabi et al.: Massive vs. Curated Embeddings for Low-Resourced\nLanguages: the Case of Yor\u00f9b\u00e1 and Twi (LREC 2020).", "evaluation_metadata": {}}, "tydiqa": {"name": "tydiqa", "description": "TyDi QA is a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs.\nThe languages of TyDi QA are diverse with regard to their typology -- the set of linguistic features that each language\nexpresses -- such that we expect models performing well on this set to generalize across a large number of the languages\nin the world. It contains language phenomena that would not be found in English-only corpora. To provide a realistic\ninformation-seeking task and avoid priming effects, questions are written by people who want to know the answer, but\ndon\u2019t know the answer yet, (unlike SQuAD and its descendents) and the data is collected directly in each language without\nthe use of translation (unlike MLQA and XQuAD).", "evaluation_metadata": {}}, "udhr": {"name": "udhr", "description": "The Universal Declaration of Human Rights (UDHR) is a milestone document in the history of human rights. Drafted by\nrepresentatives with different legal and cultural backgrounds from all regions of the world, it set out, for the\nfirst time, fundamental human rights to be universally protected. The Declaration was adopted by the UN General\nAssembly in Paris on 10 December 1948 during its 183rd plenary meeting. The dataset includes translations of the\ndocument in 464+ languages and dialects.\n\n\u00a9 1996 \u2013 2009 The Office of the High Commissioner for Human Rights\n\nThis plain text version prepared by the \u201cUDHR in Unicode\u201d project, https://www.unicode.org/udhr.", "evaluation_metadata": {}}, "un_ga": {"name": "un_ga", "description": "United nations general assembly resolutions: A six-language parallel corpus.\nThis is a collection of translated documents from the United Nations originally compiled into a translation memory by Alexandre Rafalovitch, Robert Dale (see http://uncorpora.org).\n6 languages, 15 bitexts\ntotal number of files: 6\ntotal number of tokens: 18.87M\ntotal number of sentence fragments: 0.44M", "evaluation_metadata": {}}, "un_multi": {"name": "un_multi", "description": "This is a collection of translated documents from the United Nations. This corpus is available in all 6 official languages of the UN, consisting of around 300 million words per language", "evaluation_metadata": {}}, "un_pc": {"name": "un_pc", "description": "This parallel corpus consists of manually translated UN documents from the last 25 years (1990 to 2014) for the six official UN languages, Arabic, Chinese, English, French, Russian, and Spanish.", "evaluation_metadata": {}}, "universal_dependencies": {"name": "universal_dependencies", "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).", "evaluation_metadata": {}}, "universal_morphologies": {"name": "universal_morphologies", "description": "The Universal Morphology (UniMorph) project is a collaborative effort to improve how NLP handles complex morphology in the world\u2019s languages.\nThe goal of UniMorph is to annotate morphological data in a universal schema that allows an inflected word from any language to be defined by its lexical meaning,\ntypically carried by the lemma, and by a rendering of its inflectional form in terms of a bundle of morphological features from our schema.\nThe specification of the schema is described in Sylak-Glassman (2016).", "evaluation_metadata": {}}, "urdu_fake_news": {"name": "urdu_fake_news", "description": "Urdu fake news datasets that contain news of 5 different news domains.\nThese domains are Sports, Health, Technology, Entertainment, and Business.\nThe real news are collected by combining manual approaches.", "evaluation_metadata": {}}, "vivos": {"name": "vivos", "description": "\\\r\nVIVOS is a free Vietnamese speech corpus consisting of 15 hours of recording speech prepared for\r\nVietnamese Automatic Speech Recognition task.\r\nThe corpus was prepared by AILAB, a computer science lab of VNUHCM - University of Science, with Prof. Vu Hai Quan is the head of.\r\nWe publish this corpus in hope to attract more scientists to solve Vietnamese speech recognition problems.", "evaluation_metadata": {}}, "web_nlg": {"name": "web_nlg", "description": "The WebNLG challenge consists in mapping data to text. The training data consists\nof Data/Text pairs where the data is a set of triples extracted from DBpedia and the text is a verbalisation\nof these triples. For instance, given the 3 DBpedia triples shown in (a), the aim is to generate a text such as (b).\n\na. (John_E_Blaha birthDate 1942_08_26) (John_E_Blaha birthPlace San_Antonio) (John_E_Blaha occupation Fighter_pilot)\nb. John E Blaha, born in San Antonio on 1942-08-26, worked as a fighter pilot\n\nAs the example illustrates, the task involves specific NLG subtasks such as sentence segmentation\n(how to chunk the input data into sentences), lexicalisation (of the DBpedia properties),\naggregation (how to avoid repetitions) and surface realisation\n(how to build a syntactically correct and natural sounding text).", "evaluation_metadata": {}}, "web_of_science": {"name": "web_of_science", "description": "The Web Of Science (WOS) dataset is a collection of data of published papers\navailable from the Web of Science. WOS has been released in three versions: WOS-46985, WOS-11967 and WOS-5736. WOS-46985 is the\nfull dataset. WOS-11967 and WOS-5736 are two subsets of WOS-46985.", "evaluation_metadata": {}}, "web_questions": {"name": "web_questions", "description": "This dataset consists of 6,642 question/answer pairs.\nThe questions are supposed to be answerable by Freebase, a large knowledge graph.\nThe questions are mostly centered around a single named entity.\nThe questions are popular ones asked on the web (at least in 2013).", "evaluation_metadata": {}}, "wi_locness": {"name": "wi_locness", "description": "Write & Improve (Yannakoudakis et al., 2018) is an online web platform that assists non-native\nEnglish students with their writing. Specifically, students from around the world submit letters,\nstories, articles and essays in response to various prompts, and the W&I system provides instant\nfeedback. Since W&I went live in 2014, W&I annotators have manually annotated some of these\nsubmissions and assigned them a CEFR level.", "evaluation_metadata": {}}, "wider_face": {"name": "wider_face", "description": "WIDER FACE dataset is a face detection benchmark dataset, of which images are\nselected from the publicly available WIDER dataset. We choose 32,203 images and\nlabel 393,703 faces with a high degree of variability in scale, pose and\nocclusion as depicted in the sample images. WIDER FACE dataset is organized\nbased on 61 event classes. For each event class, we randomly select 40%/10%/50%\ndata as training, validation and testing sets. We adopt the same evaluation\nmetric employed in the PASCAL VOC dataset. Similar to MALF and Caltech datasets,\nwe do not release bounding box ground truth for the test images. Users are\nrequired to submit final prediction files, which we shall proceed to evaluate.", "evaluation_metadata": {}}, "wiki40b": {"name": "wiki40b", "description": "Clean-up text for 40+ Wikipedia languages editions of pages\ncorrespond to entities. The datasets have train/dev/test splits per language.\nThe dataset is cleaned up by page filtering to remove disambiguation pages,\nredirect pages, deleted pages, and non-entity pages. Each example contains the\nwikidata id of the entity, and the full Wikipedia article after page processing\nthat removes non-content sections and structured objects.", "evaluation_metadata": {}}, "wiki_asp": {"name": "wiki_asp", "description": "WikiAsp is a multi-domain, aspect-based summarization dataset in the encyclopedic\ndomain. In this task, models are asked to summarize cited reference documents of a\nWikipedia article into aspect-based summaries. Each of the 20 domains include 10\ndomain-specific pre-defined aspects.", "evaluation_metadata": {}}, "wiki_atomic_edits": {"name": "wiki_atomic_edits", "description": "A dataset of atomic wikipedia edits containing insertions and deletions of a contiguous chunk of text in a sentence. This dataset contains ~43 million edits across 8 languages.\n\nAn atomic edit is defined as an edit e applied to a natural language expression S as the insertion, deletion, or substitution of a sub-expression P such that both the original expression S and the resulting expression e(S) are well-formed semantic constituents (MacCartney, 2009). In this corpus, we release such atomic insertions and deletions made to sentences in wikipedia.", "evaluation_metadata": {}}, "wiki_auto": {"name": "wiki_auto", "description": "WikiAuto provides a set of aligned sentences from English Wikipedia and Simple English Wikipedia\nas a resource to train sentence simplification systems. The authors first crowd-sourced a set of manual alignments\nbetween sentences in a subset of the Simple English Wikipedia and their corresponding versions in English Wikipedia\n(this corresponds to the `manual` config), then trained a neural CRF system to predict these alignments.\nThe trained model was then applied to the other articles in Simple English Wikipedia with an English counterpart to\ncreate a larger corpus of aligned sentences (corresponding to the `auto`, `auto_acl`, `auto_full_no_split`, and `auto_full_with_split` configs here).", "evaluation_metadata": {}}, "wiki_bio": {"name": "wiki_bio", "description": "This dataset gathers 728,321 biographies from wikipedia. It aims at evaluating text generation\nalgorithms. For each article, we provide the first paragraph and the infobox (both tokenized).\nFor each article, we extracted the first paragraph (text), the infobox (structured data). Each\ninfobox is encoded as a list of (field name, field value) pairs. We used Stanford CoreNLP\n(http://stanfordnlp.github.io/CoreNLP/) to preprocess the data, i.e. we broke the text into\nsentences and tokenized both the text and the field values. The dataset was randomly split in\nthree subsets train (80%), valid (10%), test (10%).", "evaluation_metadata": {}}, "wiki_dpr": {"name": "wiki_dpr", "description": "This is the wikipedia split used to evaluate the Dense Passage Retrieval (DPR) model.\nIt contains 21M passages from wikipedia along with their DPR embeddings.\nThe wikipedia articles were split into multiple, disjoint text blocks of 100 words as passages.", "evaluation_metadata": {}}, "wiki_hop": {"name": "wiki_hop", "description": "WikiHop is open-domain and based on Wikipedia articles; the goal is to recover Wikidata information by hopping through documents. The goal is to answer text understanding queries by combining multiple facts that are spread across different documents.", "evaluation_metadata": {}}, "wiki_lingua": {"name": "wiki_lingua", "description": "WikiLingua is a large-scale multilingual dataset for the evaluation of\ncross-lingual abstractive summarization systems. The dataset includes ~770k\narticle and summary pairs in 18 languages from WikiHow. The gold-standard\narticle-summary alignments across languages was done by aligning the images\nthat are used to describe each how-to step in an article.", "evaluation_metadata": {}}, "wiki_movies": {"name": "wiki_movies", "description": "The WikiMovies dataset consists of roughly 100k (templated) questions over 75k entities based on questions with answers in the open movie database (OMDb).", "evaluation_metadata": {}}, "wiki_qa": {"name": "wiki_qa", "description": "Wiki Question Answering corpus from Microsoft", "evaluation_metadata": {}}, "wiki_qa_ar": {"name": "wiki_qa_ar", "description": "Arabic Version of WikiQA by automatic automatic machine translators and crowdsourced the selection of the best one to be incorporated into the corpus", "evaluation_metadata": {}}, "wiki_source": {"name": "wiki_source", "description": "2 languages, total number of files: 132\ntotal number of tokens: 1.80M\ntotal number of sentence fragments: 78.36k", "evaluation_metadata": {}}, "wiki_split": {"name": "wiki_split", "description": "One million English sentences, each split into two sentences that together preserve the original meaning, extracted from Wikipedia\nGoogle's WikiSplit dataset was constructed automatically from the publicly available Wikipedia revision history. Although\nthe dataset contains some inherent noise, it can serve as valuable training data for models that split or merge sentences.", "evaluation_metadata": {}}, "wiki_summary": {"name": "wiki_summary", "description": "\\\r\nThe dataset extracted from Persian Wikipedia into the form of articles and highlights and cleaned the dataset into pairs of articles and highlights and reduced the articles' length (only version 1.0.0) and highlights' length to a maximum of 512 and 128, respectively, suitable for parsBERT.", "evaluation_metadata": {}}, "wikiann": {"name": "wikiann", "description": "WikiANN (sometimes called PAN-X) is a multilingual named entity recognition dataset consisting of Wikipedia articles annotated with LOC (location), PER (person), and ORG (organisation) tags in the IOB2 format. This version corresponds to the balanced train, dev, and test splits of Rahimi et al. (2019), which supports 176 of the 282 languages from the original WikiANN corpus.", "evaluation_metadata": {}}, "wikicorpus": {"name": "wikicorpus", "description": "The Wikicorpus is a trilingual corpus (Catalan, Spanish, English) that contains large portions of the Wikipedia (based on a 2006 dump) and has been automatically enriched with linguistic information. In its present version, it contains over 750 million words.", "evaluation_metadata": {}}, "wikipedia": {"name": "wikipedia", "description": "Wikipedia dataset containing cleaned articles of all languages.\nThe datasets are built from the Wikipedia dump\n(https://dumps.wikimedia.org/) with one split per language. Each example\ncontains the content of one full Wikipedia article with cleaning to strip\nmarkdown and unwanted sections (references, etc.).", "evaluation_metadata": {}}, "wikisql": {"name": "wikisql", "description": "A large crowd-sourced dataset for developing natural language interfaces for relational databases", "evaluation_metadata": {}}, "wikitext": {"name": "wikitext", "description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.", "evaluation_metadata": {}}, "wikitext_tl39": {"name": "wikitext_tl39", "description": "Large scale, unlabeled text dataset with 39 Million tokens in the training set. Inspired by the original WikiText Long Term Dependency dataset (Merity et al., 2016). TL means \"Tagalog.\" Originally published in Cruz & Cheng (2019).", "evaluation_metadata": {}}, "wili_2018": {"name": "wili_2018", "description": "It is a benchmark dataset for language identification and contains 235000 paragraphs of 235 languages", "evaluation_metadata": {}}, "wino_bias": {"name": "wino_bias", "description": "WinoBias, a Winograd-schema dataset for coreference resolution focused on gender bias.\nThe corpus contains Winograd-schema style sentences with entities corresponding to people\nreferred by their occupation (e.g. the nurse, the doctor, the carpenter).", "evaluation_metadata": {}}, "winograd_wsc": {"name": "winograd_wsc", "description": "A Winograd schema is a pair of sentences that differ in only one or two words and that contain an ambiguity that is\nresolved in opposite ways in the two sentences and requires the use of world knowledge and reasoning for its\nresolution. The schema takes its name from a well-known example by Terry Winograd:\n\n> The city councilmen refused the demonstrators a permit because they [feared/advocated] violence.\n\nIf the word is ``feared'', then ``they'' presumably refers to the city council; if it is ``advocated'' then ``they''\npresumably refers to the demonstrators.", "evaluation_metadata": {}}, "winogrande": {"name": "winogrande", "description": "WinoGrande is a new collection of 44k problems, inspired by Winograd Schema Challenge (Levesque, Davis, and Morgenstern\n 2011), but adjusted to improve the scale and robustness against the dataset-specific bias. Formulated as a\nfill-in-a-blank task with binary options, the goal is to choose the right option for a given sentence which requires\ncommonsense reasoning.", "evaluation_metadata": {}}, "wiqa": {"name": "wiqa", "description": "The WIQA dataset V1 has 39705 questions containing a perturbation and a possible effect in the context of a paragraph.\nThe dataset is split into 29808 train questions, 6894 dev questions and 3003 test questions.", "evaluation_metadata": {}}, "wisesight1000": {"name": "wisesight1000", "description": "`wisesight1000` contains Thai social media texts randomly drawn from the full `wisesight-sentiment`, tokenized by human annotators.\nOut of the labels `neg` (negative), `neu` (neutral), `pos` (positive), `q` (question), 250 samples each. Some texts are removed because\nthey look like spam.Because these samples are representative of real world content, we believe having these annotaed samples will allow\nthe community to robustly evaluate tokenization algorithms.", "evaluation_metadata": {}}, "wisesight_sentiment": {"name": "wisesight_sentiment", "description": "Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment category (positive, neutral, negative, question)\n* Released to public domain under Creative Commons Zero v1.0 Universal license.\n* Category (Labels): {\"pos\": 0, \"neu\": 1, \"neg\": 2, \"q\": 3}\n* Size: 26,737 messages\n* Language: Central Thai\n* Style: Informal and conversational. With some news headlines and advertisement.\n* Time period: Around 2016 to early 2019. With small amount from other period.\n* Domains: Mixed. Majority are consumer products and services (restaurants, cosmetics, drinks, car, hotels), with some current affairs.\n* Privacy:\n * Only messages that made available to the public on the internet (websites, blogs, social network sites).\n * For Facebook, this means the public comments (everyone can see) that made on a public page.\n * Private/protected messages and messages in groups, chat, and inbox are not included.\n* Alternations and modifications:\n * Keep in mind that this corpus does not statistically represent anything in the language register.\n * Large amount of messages are not in their original form. Personal data are removed or masked.\n * Duplicated, leading, and trailing whitespaces are removed. Other punctuations, symbols, and emojis are kept intact.\n (Mis)spellings are kept intact.\n * Messages longer than 2,000 characters are removed.\n * Long non-Thai messages are removed. Duplicated message (exact match) are removed.\n* More characteristics of the data can be explore: https://github.com/PyThaiNLP/wisesight-sentiment/blob/master/exploration.ipynb", "evaluation_metadata": [{"config": "wisesight_sentiment", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"texts": "text", "category": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "wmt20_mlqe_task1": {"name": "wmt20_mlqe_task1", "description": "This shared task (part of WMT20) will build on its previous editions\nto further examine automatic methods for estimating the quality\nof neural machine translation output at run-time, without relying\non reference translations. As in previous years, we cover estimation\nat various levels. Important elements introduced this year include: a new\ntask where sentences are annotated with Direct Assessment (DA)\nscores instead of labels based on post-editing; a new multilingual\nsentence-level dataset mainly from Wikipedia articles, where the\nsource articles can be retrieved for document-wide context; the\navailability of NMT models to explore system-internal information for the task.\n\nTask 1 uses Wikipedia data for 6 language pairs that includes high-resource\nEnglish--German (En-De) and English--Chinese (En-Zh), medium-resource\nRomanian--English (Ro-En) and Estonian--English (Et-En), and low-resource\nSinhalese--English (Si-En) and Nepalese--English (Ne-En), as well as a\ndataset with a combination of Wikipedia articles and Reddit articles\nfor Russian-English (En-Ru). The datasets were collected by translating\nsentences sampled from source language articles using state-of-the-art NMT\nmodels built using the fairseq toolkit and annotated with Direct Assessment (DA)\nscores by professional translators. Each sentence was annotated following the\nFLORES setup, which presents a form of DA, where at least three professional\ntranslators rate each sentence from 0-100 according to the perceived translation\nquality. DA scores are standardised using the z-score by rater. Participating systems\nare required to score sentences according to z-standardised DA scores.", "evaluation_metadata": {}}, "wmt20_mlqe_task2": {"name": "wmt20_mlqe_task2", "description": "This shared task (part of WMT20) will build on its previous editions\nto further examine automatic methods for estimating the quality\nof neural machine translation output at run-time, without relying\non reference translations. As in previous years, we cover estimation\nat various levels. Important elements introduced this year include: a new\ntask where sentences are annotated with Direct Assessment (DA)\nscores instead of labels based on post-editing; a new multilingual\nsentence-level dataset mainly from Wikipedia articles, where the\nsource articles can be retrieved for document-wide context; the\navailability of NMT models to explore system-internal information for the task.\n\nTask 2 evaluates the application of QE for post-editing purposes. It consists of predicting:\n- A/ Word-level tags. This is done both on source side (to detect which words caused errors)\nand target side (to detect mistranslated or missing words).\n - A1/ Each token is tagged as either `OK` or `BAD`. Additionally,\n each gap between two words is tagged as `BAD` if one or more\n missing words should have been there, and `OK` otherwise. Note\n that number of tags for each target sentence is 2*N+1, where\n N is the number of tokens in the sentence.\n - A2/ Tokens are tagged as `OK` if they were correctly\n translated, and `BAD` otherwise. Gaps are not tagged.\n- B/ Sentence-level HTER scores. HTER (Human Translation Error Rate)\nis the ratio between the number of edits (insertions/deletions/replacements)\nneeded and the reference translation length.", "evaluation_metadata": {}}, "wmt20_mlqe_task3": {"name": "wmt20_mlqe_task3", "description": "This shared task (part of WMT20) will build on its previous editions\nto further examine automatic methods for estimating the quality\nof neural machine translation output at run-time, without relying\non reference translations. As in previous years, we cover estimation\nat various levels. Important elements introduced this year include: a new\ntask where sentences are annotated with Direct Assessment (DA)\nscores instead of labels based on post-editing; a new multilingual\nsentence-level dataset mainly from Wikipedia articles, where the\nsource articles can be retrieved for document-wide context; the\navailability of NMT models to explore system-internal information for the task.\n\nThe goal of this task 3 is to predict document-level quality scores as well as fine-grained annotations.", "evaluation_metadata": {}}, "wongnai_reviews": {"name": "wongnai_reviews", "description": "Wongnai's review dataset contains restaurant reviews and ratings, mainly in Thai language.\nThe reviews are in 5 classes ranging from 1 to 5 stars.", "evaluation_metadata": {}}, "woz_dialogue": {"name": "woz_dialogue", "description": "Wizard-of-Oz (WOZ) is a dataset for training task-oriented dialogue systems. The dataset is designed around the task of finding a restaurant in the Cambridge, UK area. There are three informable slots (food, pricerange,area) that users can use to constrain the search and six requestable slots (address, phone, postcode plus the three informable slots) that the user can ask a value for once a restaurant has been offered.", "evaluation_metadata": {}}, "wrbsc": {"name": "wrbsc", "description": "WUT Relations Between Sentences Corpus contains 2827 pairs of related sentences.\nRelationships are derived from Cross-document Structure Theory (CST), which enables multi-document summarization through identification of cross-document rhetorical relationships within a cluster of related documents.\nEvery relation was marked by at least 3 annotators.", "evaluation_metadata": {}}, "x_stance": {"name": "x_stance", "description": "The x-stance dataset contains more than 150 political questions, and 67k comments written by candidates on those questions.\n\nIt can be used to train and evaluate stance detection systems.", "evaluation_metadata": {}}, "xcopa": {"name": "xcopa", "description": " XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning\nThe Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across\nlanguages. The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around\nthe globe. The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages. All the details about the\ncreation of XCOPA and the implementation of the baselines are available in the paper.\\n", "evaluation_metadata": {}}, "xcsr": {"name": "xcsr", "description": "To evaluate multi-lingual language models (ML-LMs) for commonsense reasoning in a cross-lingual zero-shot transfer setting (X-CSR), i.e., training in English and test in other languages, we create two benchmark datasets, namely X-CSQA and X-CODAH. Specifically, we automatically translate the original CSQA and CODAH datasets, which only have English versions, to 15 other languages, forming development and test sets for studying X-CSR. As our goal is to evaluate different ML-LMs in a unified evaluation protocol for X-CSR, we argue that such translated examples, although might contain noise, can serve as a starting benchmark for us to obtain meaningful analysis, before more human-translated datasets will be available in the future.", "evaluation_metadata": {}}, "xed_en_fi": {"name": "xed_en_fi", "description": "A multilingual fine-grained emotion dataset. The dataset consists of human annotated Finnish (25k) and English sentences (30k). Plutchik\u2019s\ncore emotions are used to annotate the dataset with the addition of neutral to create a multilabel multiclass\ndataset. The dataset is carefully evaluated using language-specific BERT models and SVMs to\nshow that XED performs on par with other similar datasets and is therefore a useful tool for\nsentiment analysis and emotion detection.", "evaluation_metadata": {}}, "xnli": {"name": "xnli", "description": "XNLI is a subset of a few thousand examples from MNLI which has been translated\ninto a 14 different languages (some low-ish resource). As with MNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).", "evaluation_metadata": {}}, "xor_tydi_qa": {"name": "xor_tydi_qa", "description": " XOR-TyDi QA brings together for the first time information-seeking questions,\n open-retrieval QA, and multilingual QA to create a multilingual open-retrieval\n QA dataset that enables cross-lingual answer retrieval. It consists of questions\n written by information-seeking native speakers in 7 typologically diverse languages\n and answer annotations that are retrieved from multilingual document collections.\n There are three sub-tasks: XOR-Retrieve, XOR-EnglishSpan, and XOR-Full.", "evaluation_metadata": {}}, "xquad": {"name": "xquad", "description": "XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating cross-lingual question answering\nperformance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set\nof SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations into ten languages: Spanish, German,\nGreek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, Hindi and Romanian. Consequently, the dataset is entirely parallel\nacross 12 languages.", "evaluation_metadata": {}}, "xquad_r": {"name": "xquad_r", "description": "XQuAD-R is a retrieval version of the XQuAD dataset (a cross-lingual extractive QA dataset). Like XQuAD, XQUAD-R is an 11-way parallel dataset, where each question appears in 11 different languages and has 11 parallel correct answers across the languages.", "evaluation_metadata": {}}, "xsum": {"name": "xsum", "description": "Extreme Summarization (XSum) Dataset.\n\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.", "evaluation_metadata": [{"config": "default", "task": "summarization", "task_id": "summarization", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"document": "text", "summary": "target"}, "metrics": [{"type": "rouge", "name": "Rouge"}]}]}, "xsum_factuality": {"name": "xsum_factuality", "description": "Neural abstractive summarization models are highly prone to hallucinate content that is unfaithful to the input\ndocument. The popular metric such as ROUGE fails to show the severity of the problem. The dataset consists of\nfaithfulness and factuality annotations of abstractive summaries for the XSum dataset. We have crowdsourced 3 judgements\n for each of 500 x 5 document-system pairs. This will be a valuable resource to the abstractive summarization community.", "evaluation_metadata": {}}, "xtreme": {"name": "xtreme", "description": "The Cross-lingual TRansfer Evaluation of Multilingual Encoders (XTREME) benchmark is a benchmark for the evaluation of\nthe cross-lingual generalization ability of pre-trained multilingual models. It covers 40 typologically diverse languages\n(spanning 12 language families) and includes nine tasks that collectively require reasoning about different levels of\nsyntax and semantics. The languages in XTREME are selected to maximize language diversity, coverage in existing tasks,\nand availability of training data. Among these are many under-studied languages, such as the Dravidian languages Tamil\n(spoken in southern India, Sri Lanka, and Singapore), Telugu and Malayalam (spoken mainly in southern India), and the\nNiger-Congo languages Swahili and Yoruba, spoken in Africa.", "evaluation_metadata": {}}, "yahoo_answers_qa": {"name": "yahoo_answers_qa", "description": "Yahoo Non-Factoid Question Dataset is derived from Yahoo's Webscope L6 collection using machine learning techiques such that the questions would contain non-factoid answers.The dataset contains 87,361 questions and their corresponding answers. Each question contains its best answer along with additional other answers submitted by users. Only the best answer was reviewed in determining the quality of the question-answer pair.", "evaluation_metadata": {}}, "yahoo_answers_topics": {"name": "yahoo_answers_topics", "description": "Yahoo! Answers Topic Classification is text classification dataset. The dataset is the Yahoo! Answers corpus as of 10/25/2007. The Yahoo! Answers topic classification dataset is constructed using 10 largest main categories. From all the answers and other meta-information, this dataset only used the best answer content and the main category information.", "evaluation_metadata": [{"config": "yahoo_answers_topics", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"question_content": "text", "topic": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "yelp_polarity": {"name": "yelp_polarity", "description": "Large Yelp Review Dataset.\nThis is a dataset for binary sentiment classification. We provide a set of 560,000 highly polar yelp reviews for training, and 38,000 for testing. \nORIGIN\nThe Yelp reviews dataset consists of reviews from Yelp. It is extracted\nfrom the Yelp Dataset Challenge 2015 data. For more information, please\nrefer to http://www.yelp.com/dataset_challenge\n\nThe Yelp reviews polarity dataset is constructed by\nXiang Zhang (xiang.zhang@nyu.edu) from the above dataset.\nIt is first used as a text classification benchmark in the following paper:\nXiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks\nfor Text Classification. Advances in Neural Information Processing Systems 28\n(NIPS 2015).\n\n\nDESCRIPTION\n\nThe Yelp reviews polarity dataset is constructed by considering stars 1 and 2\nnegative, and 3 and 4 positive. For each polarity 280,000 training samples and\n19,000 testing samples are take randomly. In total there are 560,000 trainig\nsamples and 38,000 testing samples. Negative polarity is class 1,\nand positive class 2.\n\nThe files train.csv and test.csv contain all the training samples as\ncomma-sparated values. There are 2 columns in them, corresponding to class\nindex (1 and 2) and review text. The review texts are escaped using double\nquotes (\"), and any internal double quote is escaped by 2 double quotes (\"\").\nNew lines are escaped by a backslash followed with an \"n\" character,\nthat is \"\\n\".", "evaluation_metadata": [{"config": "plain_text", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 binary", "args": {"average": "binary"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "yelp_review_full": {"name": "yelp_review_full", "description": "The Yelp reviews dataset consists of reviews from Yelp. It is extracted from the Yelp Dataset Challenge 2015 data.\nThe Yelp reviews full star dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the above dataset.\nIt is first used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun.\nCharacter-level Convolutional Networks for Text Classification. Advances in Neural Information Processing Systems 28 (NIPS 2015).", "evaluation_metadata": [{"config": "yelp_review_full", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "yoruba_bbc_topics": {"name": "yoruba_bbc_topics", "description": "A collection of news article headlines in Yoruba from BBC Yoruba.\nEach headline is labeled with one of the following classes: africa,\nentertainment, health, nigeria, politics, sport or world.\n\nThe dataset was presented in the paper:\nHedderich, Adelani, Zhu, Alabi, Markus, Klakow: Transfer Learning and\nDistant Supervision for Multilingual Transformer Models: A Study on\nAfrican Languages (EMNLP 2020).", "evaluation_metadata": {}}, "yoruba_gv_ner": {"name": "yoruba_gv_ner", "description": "The Yoruba GV NER dataset is a labeled dataset for named entity recognition in Yoruba. The texts were obtained from\nYoruba Global Voices News articles https://yo.globalvoices.org/ . We concentrate on\nfour types of named entities: persons [PER], locations [LOC], organizations [ORG], and dates & time [DATE].\n\nThe Yoruba GV NER data files contain 2 columns separated by a tab ('\\t'). Each word has been put on a separate line and\nthere is an empty line after each sentences i.e the CoNLL format. The first item on each line is a word, the second\nis the named entity tag. The named entity tags have the format I-TYPE which means that the word is inside a phrase\nof type TYPE. For every multi-word expression like 'New York', the first word gets a tag B-TYPE and the subsequent words\nhave tags I-TYPE, a word with tag O is not part of a phrase. The dataset is in the BIO tagging scheme.\n\nFor more details, see https://www.aclweb.org/anthology/2020.lrec-1.335/", "evaluation_metadata": {}}, "yoruba_text_c3": {"name": "yoruba_text_c3", "description": "Yoruba Text C3 is the largest Yoruba texts collected and used to train FastText embeddings in the\nYorubaTwi Embedding paper: https://www.aclweb.org/anthology/2020.lrec-1.335/", "evaluation_metadata": {}}, "yoruba_wordsim353": {"name": "yoruba_wordsim353", "description": "A translation of the word pair similarity dataset wordsim-353 to Yor\u00f9b\u00e1.\n\nThe dataset was presented in the paper\nAlabi et al.: Massive vs. Curated Embeddings for Low-Resourced\nLanguages: the Case of Yor\u00f9b\u00e1 and Twi (LREC 2020).", "evaluation_metadata": {}}, "youtube_caption_corrections": {"name": "youtube_caption_corrections", "description": "Dataset built from pairs of YouTube captions where both 'auto-generated' and\n'manually-corrected' captions are available for a single specified language.\nThis dataset labels two-way (e.g. ignoring single-sided insertions) same-length\ntoken differences in the `diff_type` column. The `default_seq` is composed of\ntokens from the 'auto-generated' captions. When a difference occurs between\nthe 'auto-generated' vs 'manually-corrected' captions types, the `correction_seq`\ncontains tokens from the 'manually-corrected' captions.", "evaluation_metadata": {}}, "zest": {"name": "zest", "description": "ZEST tests whether NLP systems can perform unseen tasks in a zero-shot way, given a natural language description of\nthe task. It is an instantiation of our proposed framework \"learning from task descriptions\". The tasks include\nclassification, typed entity extraction and relationship extraction, and each task is paired with 20 different\nannotated (input, output) examples. ZEST's structure allows us to systematically test whether models can generalize\nin five different ways.", "evaluation_metadata": {}}, "0n1xus/codexglue": {"name": "0n1xus/codexglue", "description": "CodeXGLUE is a benchmark dataset to foster machine learning research for program understanding and generation. \nCodeXGLUE includes a collection of 10 tasks across 14 datasets and a platform for model evaluation and comparison.", "evaluation_metadata": {}}, "0n1xus/pytorrent-standalone": {"name": "0n1xus/pytorrent-standalone", "description": "pytorrent-standalone is a subset of the PyTorrent dataset, where only functions that does not depend on external libraries\nare kept.", "evaluation_metadata": {}}, "AlekseyKorshuk/comedy-scripts": {"name": "AlekseyKorshuk/comedy-scripts", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "AlekseyKorshuk/horror-scripts": {"name": "AlekseyKorshuk/horror-scripts", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "BSC-LT/SQAC": {"name": "BSC-LT/SQAC", "description": "This dataset contains 6,247 contexts and 18,817 questions with their answers, 1 to 5 for each fragment.\n\nThe sources of the contexts are:\n\n* Encyclopedic articles from [Wikipedia in Spanish](https://es.wikipedia.org/), used under [CC-by-sa licence](https://creativecommons.org/licenses/by-sa/3.0/legalcode). \n\n* News from [Wikinews in Spanish](https://es.wikinews.org/), used under [CC-by licence](https://creativecommons.org/licenses/by/2.5/). \n\n* Text from the Spanish corpus [AnCora](http://clic.ub.edu/corpus/en), which is a mix from diferent newswire and literature sources, used under [CC-by licence] (https://creativecommons.org/licenses/by/4.0/legalcode). \n\nThis dataset can be used to build extractive-QA.", "evaluation_metadata": {}}, "BSC-LT/sts-ca": {"name": "BSC-LT/sts-ca", "description": "Semantic Textual Similarity in Catalan.\n STS corpus is a benchmark for evaluating Semantic Text Similarity in Catalan.\n It consists of more than 3000 sentence pairs, annotated with the semantic similarity between them, \n using a scale from 0 (no similarity at all) to 5 (semantic equivalence). \n It is done manually by 4 different annotators following our guidelines based on previous work from the SemEval challenges (https://www.aclweb.org/anthology/S13-1004.pdf).\n The source data are scraped sentences from the Catalan Textual Corpus (https://doi.org/10.5281/zenodo.4519349), used under CC-by-SA-4.0 licence (https://creativecommons.org/licenses/by-sa/4.0/). The dataset is released under the same licence.\n This dataset was developed by BSC TeMU as part of the AINA project, and to enrich the Catalan Language Understanding Benchmark (CLUB).\n This is the version 1.0.2 of the dataset with the complete human and automatic annotations and the analysis scripts. It also has a more accurate license.\n This dataset can be used to build and score semantic similiarity models.", "evaluation_metadata": {}}, "Babelscape/rebel-dataset": {"name": "Babelscape/rebel-dataset", "description": "REBEL is a silver dataset created for the paper REBEL: Relation Extraction By End-to-end Language generation", "evaluation_metadata": {}}, "Lacito/pangloss": {"name": "Lacito/pangloss", "description": "These datasets are extracts from the Pangloss collection and have\nbeen preprocessed for ASR experiments in Na and Japhug.", "evaluation_metadata": {}}, "CAiRE/ASCEND": {"name": "CAiRE/ASCEND", "description": "ASCEND (A Spontaneous Chinese-English Dataset) introduces a high-quality resource of spontaneous multi-turn conversational dialogue Chinese-English code-switching corpus collected in Hong Kong. ASCEND consists of 10.62 hours of spontaneous speech with a total of ~12.3K utterances. The corpus is split into 3 sets: training, validation, and test with a ratio of 8:1:1 while maintaining a balanced gender proportion on each set.", "evaluation_metadata": {}}, "CodedotAI/code_clippy": {"name": "CodedotAI/code_clippy", "description": "This dataset was generated by selecting GitHub repositories from a large collection of repositories. These repositories were collected from https://seart-ghs.si.usi.ch/ and Github portion of [The Pile](https://github.com/EleutherAI/github-downloader) (performed on July 7th, 2021). The goal of this dataset is to provide a training set for pretraining large language models on code data for helping software engineering researchers better understand their impacts on software related tasks such as autocompletion of code. The dataset is split into train, validation, and test splits. There is a version containing duplicates (209GBs compressed) and ones where exact duplicates (132GBs compressed) are removed. Contains mostly JavaScript and Python code, but other programming languages are included as well to various degrees.", "evaluation_metadata": {}}, "CodedotAI/code_clippy_github": {"name": "CodedotAI/code_clippy_github", "description": "The Code Clippy dataset consists of various public codebases from GitHub in 22 programming languages with 23 extensions totalling about 16 TB of data when uncompressed. The dataset was created from the public GitHub dataset on Google BiqQuery.", "evaluation_metadata": {}}, "EMBO/sd-nlp": {"name": "EMBO/sd-nlp", "description": " This dataset is based on the SourceData database and is intented to facilitate training of NLP tasks in the cell and molecualr biology domain.", "evaluation_metadata": {}}, "FRTNX/cosuju": {"name": "FRTNX/cosuju", "description": "Court Summaries and Judgements (CoSuJu) Dataset", "evaluation_metadata": {}}, "Felix-ML/quoteli3": {"name": "Felix-ML/quoteli3", "description": "This dataset is a representation of Muzny et al.'s QuoteLi3 dataset as a Huggingface dataset. It can be best used for \nquote attribution.", "evaluation_metadata": {}}, "Firoj/HumAID": {"name": "Firoj/HumAID", "description": "The HumAID Twitter dataset consists of several thousands of manually annotated tweets that has been collected during 19 major natural disaster events including earthquakes, hurricanes, wildfires, and floods, which happened from 2016 to 2019 across different parts of the World. The annotations in the provided datasets consists of following humanitarian categories. The dataset consists only english tweets and it is the largest dataset for crisis informatics so far.\n** Humanitarian categories **\n- Caution and advice\n- Displaced people and evacuations\n- Dont know cant judge\n- Infrastructure and utility damage\n- Injured or dead people\n- Missing or found people\n- Not humanitarian\n- Other relevant information\n- Requests or urgent needs\n- Rescue volunteering or donation effort\n- Sympathy and support", "evaluation_metadata": {}}, "Fraser/mnist-text-default": {"name": "Fraser/mnist-text-default", "description": "MNIST dataset adapted to a text-based representation.\n\nThis allows testing interpolation quality for Transformer-VAEs.\n\nSystem is heavily inspired by Matthew Rayfield's work https://youtu.be/Z9K3cwSL6uM\n\nWorks by quantising each MNIST pixel into one of 64 characters.\nEvery sample has an up & down version to encourage the model to learn rotation invarient features.\n\nUse `.array_to_text(` and `.text_to_array(` methods to test your generated data.\n\nData format:\n- text: (30 x 28 tokens, 840 tokens total): Textual representation of MNIST digit, for example:\n```\n00 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !\n01 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !\n02 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !\n03 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !\n04 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !\n05 down ! ! ! ! ! ! ! ! ! ! ! ! ! % % % @ C L ' J a ^ @ ! ! ! !\n06 down ! ! ! ! ! ! ! ! ( * 8 G K ` ` ` ` ` Y L ` ] Q 1 ! ! ! !\n07 down ! ! ! ! ! ! ! - \\ ` ` ` ` ` ` ` ` _ 8 5 5 / * ! ! ! ! !\n08 down ! ! ! ! ! ! ! % W ` ` ` ` ` R N ^ ] ! ! ! ! ! ! ! ! ! !\n09 down ! ! ! ! ! ! ! ! 5 H ; ` ` T # ! + G ! ! ! ! ! ! ! ! ! !\n10 down ! ! ! ! ! ! ! ! ! $ ! G ` 7 ! ! ! ! ! ! ! ! ! ! ! ! ! !\n11 down ! ! ! ! ! ! ! ! ! ! ! C ` P ! ! ! ! ! ! ! ! ! ! ! ! ! !\n12 down ! ! ! ! ! ! ! ! ! ! ! # P ` 2 ! ! ! ! ! ! ! ! ! ! ! ! !\n13 down ! ! ! ! ! ! ! ! ! ! ! ! ) ] Y I < ! ! ! ! ! ! ! ! ! ! !\n14 down ! ! ! ! ! ! ! ! ! ! ! ! ! 5 ] ` ` > ' ! ! ! ! ! ! ! ! !\n15 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! , O ` ` F ' ! ! ! ! ! ! ! !\n16 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! % 8 ` ` O ! ! ! ! ! ! ! !\n17 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! _ ` _ 1 ! ! ! ! ! ! !\n18 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! , A N ` ` T ! ! ! ! ! ! ! !\n19 down ! ! ! ! ! ! ! ! ! ! ! ! * F Z ` ` ` _ N ! ! ! ! ! ! ! !\n20 down ! ! ! ! ! ! ! ! ! ! ' = X ` ` ` ` S 4 ! ! ! ! ! ! ! ! !\n21 down ! ! ! ! ! ! ! ! & 1 V ` ` ` ` R 5 ! ! ! ! ! ! ! ! ! ! !\n22 down ! ! ! ! ! ! % K W ` ` ` ` Q 5 # ! ! ! ! ! ! ! ! ! ! ! !\n23 down ! ! ! ! . L Y ` ` ` ` ^ B # ! ! ! ! ! ! ! ! ! ! ! ! ! !\n24 down ! ! ! ! C ` ` ` V B B % ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !\n25 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !\n26 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !\n27 down ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !\n```\n- label: Just a number with the texts matching label.", "evaluation_metadata": {}}, "Fraser/mnist-text-no-spaces": {"name": "Fraser/mnist-text-no-spaces", "description": "MNIST dataset adapted to a text-based representation.\n\nThis allows testing interpolation quality for Transformer-VAEs.\n\nSystem is heavily inspired by Matthew Rayfield's work https://youtu.be/Z9K3cwSL6uM\n\nWorks by quantising each MNIST pixel into one of 64 characters.\nEvery sample has an up & down version to encourage the model to learn rotation invarient features.\n\nUse `.array_to_text(` and `.text_to_array(` methods to test your generated data.\n\nRemoved spaces to get better BPE compression on sequences.\n**Should only be used with a trained tokenizer.**\n\nData format:\n- text: (30 x 28 tokens, 840 tokens total): Textual representation of MNIST digit, for example:\n```\n00down!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n01down!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n02down!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n03down!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n04down!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n05down!!!!!!!!!!!!!%%%@CL'Ja^@!!!!\n06down!!!!!!!!(*8GK`````YL`]Q1!!!!\n07down!!!!!!!-\\\\````````_855/*!!!!!\n08down!!!!!!!%W`````RN^]!!!!!!!!!!\n09down!!!!!!!!5H;``T#!+G!!!!!!!!!!\n10down!!!!!!!!!$!G`7!!!!!!!!!!!!!!\n11down!!!!!!!!!!!C`P!!!!!!!!!!!!!!\n12down!!!!!!!!!!!#P`2!!!!!!!!!!!!!\n13down!!!!!!!!!!!!)]YI'!!!!!!!!!\n15down!!!!!!!!!!!!!!,O``F'!!!!!!!!\n16down!!!!!!!!!!!!!!!%8``O!!!!!!!!\n17down!!!!!!!!!!!!!!!!!_`_1!!!!!!!\n18down!!!!!!!!!!!!!!,AN``T!!!!!!!!\n19down!!!!!!!!!!!!*FZ```_N!!!!!!!!\n20down!!!!!!!!!!'=X````S4!!!!!!!!!\n21down!!!!!!!!&1V````R5!!!!!!!!!!!\n22down!!!!!!%KW````Q5#!!!!!!!!!!!!\n23down!!!!.LY````^B#!!!!!!!!!!!!!!\n24down!!!!C```VBB%!!!!!!!!!!!!!!!!\n25down!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n26down!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n27down!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n```\n- label: Just a number with the texts matching label.", "evaluation_metadata": {}}, "Fraser/mnist-text-small": {"name": "Fraser/mnist-text-small", "description": "MNIST dataset adapted to a text-based representation.\n\n*Modified images to be ~1/4 the original area.*\nDone by taking a max pool.\n\nThis allows testing interpolation quality for Transformer-VAEs.\n\nSystem is heavily inspired by Matthew Rayfield's work https://youtu.be/Z9K3cwSL6uM\n\nWorks by quantising each MNIST pixel into one of 64 characters.\nEvery sample has an up & down version to encourage the model to learn rotation invarient features.\n\nUse `.array_to_text(` and `.text_to_array(` methods to test your generated data.\n\nData format:\n- text: (16 x 14 tokens, 224 tokens total): Textual representation of MNIST digit, for example:\n```\n00 down ! ! ! ! ! ! ! ! ! ! ! ! ! !\n01 down ! ! ! ! ! ! ! ! ! ! ! ! ! !\n02 down ! ! ! ! ! ! % % C L a ^ ! !\n03 down ! ! ! - ` ` ` ` ` Y ` Q ! !\n04 down ! ! ! % ` ` ` R ^ ! ! ! ! !\n05 down ! ! ! ! $ G ` ! ! ! ! ! ! !\n06 down ! ! ! ! ! # ` Y < ! ! ! ! !\n07 down ! ! ! ! ! ! 5 ` ` F ! ! ! !\n08 down ! ! ! ! ! ! ! % ` ` 1 ! ! !\n09 down ! ! ! ! ! ! F ` ` ` ! ! ! !\n10 down ! ! ! ! 1 ` ` ` ` 4 ! ! ! !\n11 down ! ! L ` ` ` ` 5 ! ! ! ! ! !\n12 down ! ! ` ` V B ! ! ! ! ! ! ! !\n13 down ! ! ! ! ! ! ! ! ! ! ! ! ! !\n```\n- label: Just a number with the texts matching label.", "evaluation_metadata": {}}, "Fraser/python-lines": {"name": "Fraser/python-lines", "description": "Dataset of single lines of Python code taken from the [CodeSearchNet](https://github.com/github/CodeSearchNet) dataset.\n\nContext\n\nThis dataset allows checking the validity of Variational-Autoencoder latent spaces by testing what percentage of random/intermediate latent points can be greedily decoded into valid Python code.\n\nContent\n\nEach row has a parsable line of source code.\n{'text': '{python source code line}'}\n\nMost lines are < 100 characters while all are under 125 characters.\n\nContains 2.6 million lines.\n\nAll code is in parsable into a python3 ast.", "evaluation_metadata": {}}, "Fraser/python-state-changes": {"name": "Fraser/python-state-changes", "description": "Python state changes from a single line of code.", "evaluation_metadata": {}}, "Fraser/short-jokes": {"name": "Fraser/short-jokes", "description": "Copy of [Kaggle dataset](https://www.kaggle.com/abhinavmoudgil95/short-jokes), adding to Huggingface for ease of use.\n\nDescription from Kaggle:\n\nContext\n\nGenerating humor is a complex task in the domain of machine learning, and it requires the models to understand the deep semantic meaning of a joke in order to generate new ones. Such problems, however, are difficult to solve due to a number of reasons, one of which is the lack of a database that gives an elaborate list of jokes. Thus, a large corpus of over 0.2 million jokes has been collected by scraping several websites containing funny and short jokes.\n\nVisit my Github repository for more information regarding collection of data and the scripts used.\n\nContent\n\nThis dataset is in the form of a csv file containing 231,657 jokes. Length of jokes ranges from 10 to 200 characters. Each line in the file contains a unique ID and joke.\n\nDisclaimer\n\nIt has been attempted to keep the jokes as clean as possible. Since the data has been collected by scraping websites, it is possible that there may be a few jokes that are inappropriate or offensive to some people.", "evaluation_metadata": {}}, "GEM/ART": {"name": "GEM/ART", "description": "the Abductive Natural Language Generation Dataset from AI2", "evaluation_metadata": {}}, "GEM/BiSECT": {"name": "GEM/BiSECT", "description": "BiSECT is a Split and Rephrase corpus created via bilingual pivoting.", "evaluation_metadata": {}}, "GEM/OrangeSum": {"name": "GEM/OrangeSum", "description": "The OrangeSum dataset was inspired by the XSum dataset. It was created by scraping the \"Orange Actu\" website: https://actu.orange.fr/. Orange S.A. is a large French multinational telecommunications corporation, with 266M customers worldwide. Scraped pages cover almost a decade from Feb 2011 to Sep 2020. They belong to five main categories: France, world, politics, automotive, and society. The society category is itself divided into 8 subcategories: health, environment, people, culture, media, high-tech, unsual (\"insolite\" in French), and miscellaneous.\n\nEach article featured a single-sentence title as well as a very brief abstract, both professionally written by the author of the article. These two fields were extracted from each page, thus creating two summarization tasks: OrangeSum Title and OrangeSum Abstract.", "evaluation_metadata": {}}, "GEM/RiSAWOZ": {"name": "GEM/RiSAWOZ", "description": "RiSAWOZ contains 11.2K human-to-human (H2H) multiturn semantically annotated dialogues, with more than 150K utterances spanning over 12 domains, which is larger than all previous annotated H2H conversational datasets.Both single- and multi-domain dialogues are constructed, accounting for 65% and 35%, respectively.", "evaluation_metadata": {}}, "GEM/RotoWire_English-German": {"name": "GEM/RotoWire_English-German", "description": "Dataset for the WNGT 2019 DGT shared task on \"Document-Level Generation and Translation\u201d.", "evaluation_metadata": {}}, "GEM/SIMPITIKI": {"name": "GEM/SIMPITIKI", "description": "SIMPITIKI is a Simplification corpus for Italian and it consists of two sets of simplified pairs: the first one is harvested from the Italian Wikipedia in a semi-automatic way; the second one is manually annotated sentence-by-sentence from documents in the administrative domain.", "evaluation_metadata": {}}, "GEM/SciDuet": {"name": "GEM/SciDuet", "description": "SciDuet is the first publicaly available dataset for the challenging task of document2slides generation,\nThe dataset integrated into GEM is the ACL portion of the whole dataset described in \"https://aclanthology.org/2021.naacl-main.111.pdf\".\nIt contains the full Dev and Test sets, and a portion of the Train dataset. \nWe additionally create a challenge dataset in which the slide titles do not match with the \nsection headers of the corresponding paper.\nNote that although we cannot release the whole training dataset due to copyright issues, researchers can still \nuse our released data procurement code from https://github.com/IBM/document2slides\nto generate the training dataset from the online ICML/NeurIPS anthologies. \nIn the released dataset, the original papers and slides (both are in PDF format) are carefully processed by a combination of PDF/Image processing tookits.\nThe text contents from multiple slides that correspond to the same slide title are mreged.", "evaluation_metadata": {}}, "GEM/Taskmaster": {"name": "GEM/Taskmaster", "description": "The Taskmaster-3 (aka TicketTalk) dataset consists of 23,789 movie ticketing dialogs\n(located in Taskmaster/TM-3-2020/data/). By \"movie ticketing\" we mean conversations\nwhere the customer's goal is to purchase tickets after deciding on theater, time,\nmovie name, number of tickets, and date, or opt out of the transaction.\nThe columns are gem_id, 0, 1 for serial numbering, 2 for the text dialog and id\nfor the default id by the authors.", "evaluation_metadata": {}}, "GEM/cochrane-simplification": {"name": "GEM/cochrane-simplification", "description": "This dataset measures the ability for a model to simplify paragraphs of medical text through the omission non-salient information and simplification of medical jargon.", "evaluation_metadata": {}}, "GEM/common_gen": {"name": "GEM/common_gen", "description": "CommonGen is a constrained text generation task, associated with a benchmark\ndataset, to explicitly test machines for the ability of generative commonsense\nreasoning. Given a set of common concepts; the task is to generate a coherent\nsentence describing an everyday scenario using these concepts.", "evaluation_metadata": {}}, "GEM/conversational_weather": {"name": "GEM/conversational_weather", "description": "The Conversational Weather dataset is designed for generation of responses to weather queries based on a structured input data. The input allows specifying data attributes such as dates, times, locations, weather conditions, and errors, and also offers control over structure of response through discourse relations such as join, contrast, and justification.", "evaluation_metadata": {}}, "GEM/cs_restaurants": {"name": "GEM/cs_restaurants", "description": "The task is generating responses in the context of a (hypothetical) dialogue\nsystem that provides information about restaurants. The input is a basic\nintent/dialogue act type and a list of slots (attributes) and their values.\nThe output is a natural language sentence.", "evaluation_metadata": {}}, "GEM/dart": {"name": "GEM/dart", "description": "DART is a large and open-domain structured DAta Record to Text generation corpus\nwith high-quality sentence annotations with each input being a set of\nentity-relation triples following a tree-structured ontology. It consists of\n82191 examples across different domains with each input being a semantic RDF\ntriple set derived from data records in tables and the tree ontology of table\nschema, annotated with sentence description that covers all facts in the triple set.", "evaluation_metadata": {}}, "GEM/e2e_nlg": {"name": "GEM/e2e_nlg", "description": "The E2E dataset is designed for a limited-domain data-to-text task --\ngeneration of restaurant descriptions/recommendations based on up to 8 different\nattributes (name, area, price range etc.).", "evaluation_metadata": {}}, "GEM/indonlg": {"name": "GEM/indonlg", "description": "The IndoNLG benchmark is a collection of resources for training, evaluating, and analyzing natural language generation systems for Indonesian, Javanese, and Sundanese.", "evaluation_metadata": {}}, "GEM/mlb_data_to_text": {"name": "GEM/mlb_data_to_text", "description": "The MLB dataset for data to text generation contains Major League Baseball games statistics and \ntheir human-written summaries.", "evaluation_metadata": {}}, "GEM/mlsum": {"name": "GEM/mlsum", "description": "This is the MLSUM subset of the GEM benchmark. MLSUM is the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.", "evaluation_metadata": {}}, "GEM/opusparcus": {"name": "GEM/opusparcus", "description": "Opusparcus is a paraphrase corpus for six European languages: German,\nEnglish, Finnish, French, Russian, and Swedish. The paraphrases are\nextracted from the OpenSubtitles2016 corpus, which contains subtitles\nfrom movies and TV shows.", "evaluation_metadata": {}}, "GEM/schema_guided_dialog": {"name": "GEM/schema_guided_dialog", "description": "The Schema-Guided Dialogue (SGD) dataset contains 18K multi-domain task-oriented\ndialogues between a human and a virtual assistant, which covers 17 domains\nranging from banks and events to media, calendar, travel, and weather. The\nlanguage presents in the datset is only English. The SGD dataset provides a\nchallenging testbed for a number of tasks in task-oriented dialogue, including\nlanguage understanding, slot filling, dialogue state tracking and response\ngeneration. For the creation of the SGD dataset, they developed a multi-domain\ndialogue simulator that generates dialogue outlines over an arbitrary combination\nof APIs, dialogue states and system actions. Then, they used a crowd-sourcing\nprocedure to paraphrase these outlines to natural language utterances. This novel\ncrowd-sourcing procedure preserves all annotations obtained from the simulator and\ndoes not require any extra annotations after dialogue collection.", "evaluation_metadata": {}}, "GEM/sportsett_basketball": {"name": "GEM/sportsett_basketball", "description": "SportSett:Basketball dataset for Data-to-Text Generation contains NBA games stats aligned with their human written summaries.", "evaluation_metadata": {}}, "GEM/squad_v2": {"name": "GEM/squad_v2", "description": " SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers\n to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but\n also determine when no answer is supported by the paragraph and abstain from answering.", "evaluation_metadata": {}}, "GEM/totto": {"name": "GEM/totto", "description": "ToTTo is an open-domain English table-to-text dataset with over 120,000 training examples that proposes a controlled generation task: given a Wikipedia table and a set of highlighted table cells, produce a one-sentence description.", "evaluation_metadata": {}}, "GEM/turku_hockey_data2text": {"name": "GEM/turku_hockey_data2text", "description": "The Turku Hockey Data2Text corpus was developed as a benchmark for evaluating template-free, machine learning methods on Finnish news generation in the area of ice hockey reporting. This dataset is a collection of 3,454 ice hockey games, each including game statistics and a news article describing the game. Each game includes manual alignment of events (such as goals or penalties) and sentences describing the specific event in natural language extracted from the news article. The corpus includes 12,827 annotated events. The natural language passages are manually curated not to include any information not derivable from the input data or world knowledge.", "evaluation_metadata": {}}, "GEM/turku_paraphrase_corpus": {"name": "GEM/turku_paraphrase_corpus", "description": "Turku Paraphrase Corpus is a dataset of 104,645 manually annotated Finnish paraphrases. The vast majority of the data is classified as a paraphrase either in the given context, or universally.", "evaluation_metadata": {}}, "GEM/viggo": {"name": "GEM/viggo", "description": "ViGGO was designed for the task of data-to-text generation in chatbots (as opposed to task-oriented dialogue systems), with target responses being more conversational than information-seeking, yet constrained to the information presented in a meaning representation. The dataset, being relatively small and clean, can also serve for demonstrating transfer learning capabilities of neural models.", "evaluation_metadata": {}}, "GEM/web_nlg": {"name": "GEM/web_nlg", "description": "WebNLG is a bi-lingual dataset (English, Russian) of parallel DBpedia triple sets\nand short texts that cover about 450 different DBpedia properties. The WebNLG data\nwas originally created to promote the development of RDF verbalisers able to\ngenerate short text and to handle micro-planning (i.e., sentence segmentation and\nordering, referring expression generation, aggregation); the goal of the task is\nto generate texts starting from 1 to 7 input triples which have entities in common\n(so the input is actually a connected Knowledge Graph). The dataset contains about\n17,000 triple sets and 45,000 crowdsourced texts in English, and 7,000 triples sets\nand 19,000 crowdsourced texts in Russian. A challenging test set section with\nentities and/or properties that have not been seen at training time is available.", "evaluation_metadata": {}}, "GEM/wiki_cat_sum": {"name": "GEM/wiki_cat_sum", "description": "Summarise the most important facts of a given entity in the Film, Company, and Animal domains from a cluster of related documents.", "evaluation_metadata": {}}, "GEM/wiki_lingua": {"name": "GEM/wiki_lingua", "description": "WikiLingua is a large-scale multilingual dataset for the evaluation of\ncrosslingual abstractive summarization systems. The dataset includes ~770k\narticle and summary pairs in 18 languages from WikiHow. The gold-standard\narticle-summary alignments across languages was done by aligning the images\nthat are used to describe each how-to step in an article.", "evaluation_metadata": {}}, "GEM/xlsum": {"name": "GEM/xlsum", "description": "We present XLSum, a comprehensive and diverse dataset comprising 1.35 million professionally\nannotated article-summary pairs from BBC, extracted using a set of carefully designed heuristics.\nThe dataset covers 45 languages ranging from low to high-resource, for many of which no\npublic dataset is currently available. XL-Sum is highly abstractive, concise,\nand of high quality, as indicated by human and intrinsic evaluation.", "evaluation_metadata": {}}, "Graphcore/gqa-lxmert": {"name": "Graphcore/gqa-lxmert", "description": "GQA is a new dataset for real-world visual reasoning and compositional question answering,\nseeking to address key shortcomings of previous visual question answering (VQA) datasets.", "evaluation_metadata": {}}, "Graphcore/gqa": {"name": "Graphcore/gqa", "description": "GQA is a new dataset for real-world visual reasoning and compositional question answering,\nseeking to address key shortcomings of previous visual question answering (VQA) datasets.", "evaluation_metadata": {}}, "Graphcore/vqa": {"name": "Graphcore/vqa", "description": "VQA is a new dataset containing open-ended questions about images. \nThese questions require an understanding of vision, language and commonsense knowledge to answer.", "evaluation_metadata": {}}, "GroNLP/ik-nlp-22_slp": {"name": "GroNLP/ik-nlp-22_slp", "description": "Paragraphs from the Speech and Language Processing book (3ed) by Jurafsky and Martin extracted semi-automatically\nfrom Chapters 2 to 11 of the original book draft.", "evaluation_metadata": {}}, "GroNLP/ik-nlp-22_transqe": {"name": "GroNLP/ik-nlp-22_transqe", "description": "The e-SNLI dataset extends the Stanford Natural Language Inference Dataset to\ninclude human-annotated natural language explanations of the entailment\nrelations. This version includes an automatic translation to Dutch and two quality estimation annotations\nfor each translated field.", "evaluation_metadata": {}}, "Helsinki-NLP/tatoeba_mt": {"name": "Helsinki-NLP/tatoeba_mt", "description": "The Tatoeba Translation Challenge is a multilingual data set of\nmachine translation benchmarks derived from user-contributed\ntranslations collected by [Tatoeba.org](https://tatoeba.org/) and\nprovided as parallel corpus from [OPUS](https://opus.nlpl.eu/). This\ndataset includes test and development data sorted by language pair. It\nincludes test sets for hundreds of language pairs and is continuously\nupdated. Please, check the version number tag to refer to the release\nthat your are using.", "evaluation_metadata": {}}, "KBLab/sucx3_ner": {"name": "KBLab/sucx3_ner", "description": " The dataset is a conversion of the venerable SUC 3.0 dataset into the\n huggingface ecosystem. The original dataset does not contain an official\n train-dev-test split, which is introduced here; the tag distribution for the\n NER tags between the three splits is mostly the same.\n \n The dataset has three different types of tagsets: manually annotated POS,\n manually annotated NER, and automatically annotated NER. For the\n automatically annotated NER tags, only sentences were chosen, where the\n automatic and manual annotations would match (with their respective\n categories).\n \n Additionally we provide remixes of the same data with some or all sentences\n being lowercased.", "evaluation_metadata": {}}, "Kili/plastic_in_river": {"name": "Kili/plastic_in_river", "description": "This dataset contains photos of rivers on which there may be waste. The waste items are annotated\n through bounding boxes, and are assigned to one of the 4 following categories: plastic bottle, plastic bag,\n another plastic waste, or non-plastic waste. Note that some photos may not contain any waste.", "evaluation_metadata": {}}, "LeverageX/klue-mrc": {"name": "LeverageX/klue-mrc", "description": "Klue Machine Reading Comprehension Data", "evaluation_metadata": {}}, "Narsil/asr_dummy": {"name": "Narsil/asr_dummy", "description": "Self-supervised learning (SSL) has proven vital for advancing research in\nnatural language processing (NLP) and computer vision (CV). The paradigm\npretrains a shared model on large volumes of unlabeled data and achieves\nstate-of-the-art (SOTA) for various tasks with minimal adaptation. However, the\nspeech processing community lacks a similar setup to systematically explore the\nparadigm. To bridge this gap, we introduce Speech processing Universal\nPERformance Benchmark (SUPERB). SUPERB is a leaderboard to benchmark the\nperformance of a shared model across a wide range of speech processing tasks\nwith minimal architecture changes and labeled data. Among multiple usages of the\nshared model, we especially focus on extracting the representation learned from\nSSL due to its preferable re-usability. We present a simple framework to solve\nSUPERB tasks by learning task-specialized lightweight prediction heads on top of\nthe frozen shared model. Our results demonstrate that the framework is promising\nas SSL representations show competitive generalizability and accessibility\nacross SUPERB tasks. We release SUPERB as a challenge with a leaderboard and a\nbenchmark toolkit to fuel the research in representation learning and general\nspeech processing.\n\nNote that in order to limit the required storage for preparing this dataset, the\naudio is stored in the .flac format and is not converted to a float32 array. To\nconvert, the audio file to a float32 array, please make use of the `.map()`\nfunction as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```", "evaluation_metadata": {}}, "NbAiLab/NCC": {"name": "NbAiLab/NCC", "description": "\\\\nNorwegian Colossal Corpus v2. Short sequences of maximum 100k characters.", "evaluation_metadata": {}}, "NbAiLab/NPSC": {"name": "NbAiLab/NPSC", "description": "The Norwegian Parliament Speech Corpus (NPSC) is a corpus for training a Norwegian ASR (Automatic Speech Recognition) models. The corpus is created by Spr\u00e5kbanken at the National Library in Norway.\n\nNPSC is based on sound recording from meeting in the Norwegian Parliament. These talks are orthographically transcribed to either Norwegian Bokm\u00e5l or Norwegian Nynorsk. In addition to the data actually included in this dataset, there is a significant amount of metadata that is included in the original corpus. Through the speaker id there is additional information about the speaker, like gender, age, and place of birth (ie dialect). Through the proceedings id the corpus can be linked to the official proceedings from the meetings.\n\nThe corpus is in total sound recordings from 40 entire days of meetings. This amounts to 140 hours of speech, 65,000 sentences or 1.2 million words.", "evaluation_metadata": {}}, "NbAiLab/norne": {"name": "NbAiLab/norne", "description": "NorNE is a manually annotated\ncorpus of named entities which extends the annotation of the existing\nNorwegian Dependency Treebank. Comprising both of the official standards of\nwritten Norwegian (Bokm\u00e5l and Nynorsk), the corpus contains around 600,000\ntokens and annotates a rich set of entity types including persons,\norganizations, locations, geo-political entities, products, and events,\nin addition to a class corresponding to nominals derived from names.", "evaluation_metadata": {}}, "NbAiLab/norwegian_parliament": {"name": "NbAiLab/norwegian_parliament", "description": "The Norwegian Parliament Speeches is a collection of text passages from\n1998 to 2016 and pronounced at the Norwegian Parliament (Storting) by members\nof the two major parties: Fremskrittspartiet and Sosialistisk Venstreparti.", "evaluation_metadata": {}}, "PlanTL-GOB-ES/SQAC": {"name": "PlanTL-GOB-ES/SQAC", "description": "This dataset contains 6,247 contexts and 18,817 questions with their answers, 1 to 5 for each fragment.\n\nThe sources of the contexts are:\n\n* Encyclopedic articles from [Wikipedia in Spanish](https://es.wikipedia.org/), used under [CC-by-sa licence](https://creativecommons.org/licenses/by-sa/3.0/legalcode). \n\n* News from [Wikinews in Spanish](https://es.wikinews.org/), used under [CC-by licence](https://creativecommons.org/licenses/by/2.5/). \n\n* Text from the Spanish corpus [AnCora](http://clic.ub.edu/corpus/en), which is a mix from diferent newswire and literature sources, used under [CC-by licence] (https://creativecommons.org/licenses/by/4.0/legalcode). \n\nThis dataset can be used to build extractive-QA.", "evaluation_metadata": {}}, "PlanTL-GOB-ES/pharmaconer": {"name": "PlanTL-GOB-ES/pharmaconer", "description": "PharmaCoNER: Pharmacological Substances, Compounds and Proteins Named Entity Recognition track\n\nThis dataset is designed for the PharmaCoNER task, sponsored by Plan de Impulso de las Tecnolog\u00edas del Lenguaje (Plan TL).\n\nIt is a manually classified collection of clinical case studies derived from the Spanish Clinical Case Corpus (SPACCC), an\nopen access electronic library that gathers Spanish medical publications from SciELO (Scientific Electronic Library Online).\n\nThe annotation of the entire set of entity mentions was carried out by medicinal chemistry experts\nand it includes the following 4 entity types: NORMALIZABLES, NO_NORMALIZABLES, PROTEINAS and UNCLEAR.\n\nThe PharmaCoNER corpus contains a total of 396,988 words and 1,000 clinical cases that have been randomly sampled into 3 subsets.\nThe training set contains 500 clinical cases, while the development and test sets contain 250 clinical cases each.\nIn terms of training examples, this translates to a total of 8074, 3764 and 3931 annotated sentences in each set.\nThe original dataset was distributed in Brat format (https://brat.nlplab.org/standoff.html).\n\nFor further information, please visit https://temu.bsc.es/pharmaconer/ or send an email to encargo-pln-life@bsc.es", "evaluation_metadata": {}}, "RohanAiLab/persian_blog": {"name": "RohanAiLab/persian_blog", "description": "persian_blog is a dataset consist of 400K blog posts from various websites and has types of tones.\nthis dataset can be used in different NLG tasks and as a show-case it's is used in training reformer-persian.", "evaluation_metadata": {}}, "RohanAiLab/persian_daily_news": {"name": "RohanAiLab/persian_daily_news", "description": "Persian Daily News dataset is a collection of 2 million news articles with the headline of each news article.\nThis dataset contains news articles and their summaries for the last 10 years.\nThis dataset is provided by Rohan AI lab for research purposes.", "evaluation_metadata": {}}, "RohanAiLab/persian_news_dataset": {"name": "RohanAiLab/persian_news_dataset", "description": "persian_news_dataset is a collection of 5 million news articles. \nNews articles have been gathered from more than 10 news agencies for the last 12 years. \nThe dataset is provided by Rohan AI lab for research purposes.\nfor more information refer to this link:", "evaluation_metadata": {}}, "RollingMuffin/test_scripts": {"name": "RollingMuffin/test_scripts", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "SajjadAyoubi/persian_qa": {"name": "SajjadAyoubi/persian_qa", "description": "\\\\\\\\\\\\\\Persian Question Answering (PersianQA) Dataset is a reading comprehension dataset on Persian Wikipedia. \nThe crowd-sourced dataset consists of more than 9,000 entries. Each entry can be either an impossible to answer or a question with one or more answers spanning in the passage (the context) from which the questioner proposed the question. Much like the SQuAD2.0 dataset, the impossible or unanswerable questions can be utilized to create a system which \"knows that it doesn't know the answer\".", "evaluation_metadata": {}}, "Sam2021/Arguement_Mining_CL2017": {"name": "Sam2021/Arguement_Mining_CL2017", "description": "tokens along with chunk id. IOB1 format Begining of arguement denoted by B-ARG,inside arguement\ndenoted by I-ARG, other chunks are O\nOrginial train,test split as used by the paper is provided", "evaluation_metadata": {}}, "SetFit/amazon_counterfactual": {"name": "SetFit/amazon_counterfactual", "description": "The dataset contains sentences from Amazon customer reviews (sampled from Amazon product review dataset) annotated for counterfactual detection (CFD) binary classification. Counterfactual statements describe events that did not or cannot take place. Counterfactual statements may be identified as statements of the form \u2013 If p was true, then q would be true (i.e. assertions whose antecedent (p) and consequent (q) are known or assumed to be false).", "evaluation_metadata": {}}, "SetFit/ethos": {"name": "SetFit/ethos", "description": "ETHOS: onlinE haTe speecH detectiOn dataSet. This repository contains a dataset for hate speech\ndetection on social media platforms, called Ethos. There are two variations of the dataset:\n\nEthos_Dataset_Binary: contains 998 comments in the dataset alongside with a label\nabout hate speech presence or absence. 565 of them do not contain hate speech,\nwhile the rest of them, 433, contain.\n\nEthos_Dataset_Multi_Label: which contains 8 labels for the 433 comments with hate speech content.\nThese labels are violence (if it incites (1) or not (0) violence), directed_vs_general (if it is\ndirected to a person (1) or a group (0)), and 6 labels about the category of hate speech like,\ngender, race, national_origin, disability, religion and sexual_orientation.", "evaluation_metadata": {}}, "SoLID/shellcode_i_a32": {"name": "SoLID/shellcode_i_a32", "description": "Shellcode_IA32 is a dataset for shellcode generation from English intents. The shellcodes are compilable on Intel Architecture 32-bits.", "evaluation_metadata": {}}, "SocialGrep/one-year-of-r-india": {"name": "SocialGrep/one-year-of-r-india", "description": "This corpus contains the complete data for the activity of the subreddit /r/India from Sep 30, 2020 to Sep 30, 2021.", "evaluation_metadata": {}}, "SocialGrep/reddit-crypto-aug-2021": {"name": "SocialGrep/reddit-crypto-aug-2021", "description": "This corpus contains the complete data for the activity on seven major cryptocurrency subreddits for the entire month of August 2021.", "evaluation_metadata": {}}, "SocialGrep/reddit-nonewnormal-complete": {"name": "SocialGrep/reddit-nonewnormal-complete", "description": "This corpus contains the complete data for the activity on subreddit /r/NoNewNormal for the entire duration of its existence.", "evaluation_metadata": {}}, "SocialGrep/reddit-wallstreetbets-aug-2021": {"name": "SocialGrep/reddit-wallstreetbets-aug-2021", "description": "This corpus contains the complete data for the activity on /r/WallStreetBets for the entire month of August 2021.", "evaluation_metadata": {}}, "SocialGrep/ten-million-reddit-answers": {"name": "SocialGrep/ten-million-reddit-answers", "description": "A spiritual successor to our One Million Questions, this NLP dataset contains an outstanding ten million of /r/AskReddit answers, going back from the end of November of 2020.", "evaluation_metadata": {}}, "SocialGrep/the-reddit-covid-dataset": {"name": "SocialGrep/the-reddit-covid-dataset", "description": "This dataset attempts to capture the full extent of COVID-19 discussion across the entire site of Reddit. All posts and comments found to mention the term 'COVID' as of 2021-10-25 have been gathered from the site.", "evaluation_metadata": {}}, "SocialGrep/top-american-universities-on-reddit": {"name": "SocialGrep/top-american-universities-on-reddit", "description": "This NLP dataset contains all the posts and comments in the subreddits of top 10 universities in the United States, chosen according to the 2019 Forbes ranking.", "evaluation_metadata": {}}, "TRoboto/names": {"name": "TRoboto/names", "description": "List of Arabic first names with meaning and origin of most names", "evaluation_metadata": {}}, "TurkuNLP/turku_hockey_data2text": {"name": "TurkuNLP/turku_hockey_data2text", "description": "The Turku Hockey Data2Text corpus was developed as a benchmark for evaluating template-free, machine learning methods on Finnish news generation in the area of ice hockey reporting. This dataset is a collection of 3,454 ice hockey games, each including game statistics and a news article describing the game. Each game includes manual alignment of events (such as goals or penalties) and sentences describing the specific event in natural language extracted from the news article. The corpus includes 12,827 annotated events. The natural language passages are manually curated not to include any information not derivable from the input data or world knowledge.", "evaluation_metadata": {}}, "TurkuNLP/turku_paraphrase_corpus": {"name": "TurkuNLP/turku_paraphrase_corpus", "description": "Turku Paraphrase Corpus is a dataset of 104,645 manually annotated Finnish paraphrases. The vast majority of the data is classified as a paraphrase either in the given context, or universally.", "evaluation_metadata": {}}, "Zaid/coqa_expanded": {"name": "Zaid/coqa_expanded", "description": "\\\\nCoQA: A Conversational Question Answering Challenge", "evaluation_metadata": {}}, "Zaid/quac_expanded": {"name": "Zaid/quac_expanded", "description": "\\\\nQuestion Answering in Context is a dataset for modeling, understanding,\nand participating in information seeking dialog. Data instances consist\nof an interactive dialog between two crowd workers: (1) a student who\nposes a sequence of freeform questions to learn as much as possible\nabout a hidden Wikipedia text, and (2) a teacher who answers the questions\nby providing short excerpts (spans) from the text. QuAC introduces\nchallenges not found in existing machine comprehension datasets: its\nquestions are often more open-ended, unanswerable, or only meaningful\nwithin the dialog context.", "evaluation_metadata": {}}, "ai4bharat/samanantar": {"name": "ai4bharat/samanantar", "description": "Samanantar is the largest publicly available parallel corpora collection for Indic languages: Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Oriya, Punjabi, Tamil, Telugu. The corpus has 49.6M sentence pairs between English to Indian Languages.", "evaluation_metadata": {}}, "albertvillanova/legal_contracts": {"name": "albertvillanova/legal_contracts", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "allenai/scico": {"name": "allenai/scico", "description": " SciCo is a dataset for hierarchical cross-document coreference resolution\n over scientific papers in the CS domain.", "evaluation_metadata": {}}, "anton-l/superb_demo": {"name": "anton-l/superb_demo", "description": "Self-supervised learning (SSL) has proven vital for advancing research in\nnatural language processing (NLP) and computer vision (CV). The paradigm\npretrains a shared model on large volumes of unlabeled data and achieves\nstate-of-the-art (SOTA) for various tasks with minimal adaptation. However, the\nspeech processing community lacks a similar setup to systematically explore the\nparadigm. To bridge this gap, we introduce Speech processing Universal\nPERformance Benchmark (SUPERB). SUPERB is a leaderboard to benchmark the\nperformance of a shared model across a wide range of speech processing tasks\nwith minimal architecture changes and labeled data. Among multiple usages of the\nshared model, we especially focus on extracting the representation learned from\nSSL due to its preferable re-usability. We present a simple framework to solve\nSUPERB tasks by learning task-specialized lightweight prediction heads on top of\nthe frozen shared model. Our results demonstrate that the framework is promising\nas SSL representations show competitive generalizability and accessibility\nacross SUPERB tasks. We release SUPERB as a challenge with a leaderboard and a\nbenchmark toolkit to fuel the research in representation learning and general\nspeech processing.\n\nNote that in order to limit the required storage for preparing this dataset, the\naudio is stored in the .flac format and is not converted to a float32 array. To\nconvert, the audio file to a float32 array, please make use of the `.map()`\nfunction as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```", "evaluation_metadata": {}}, "anton-l/superb_dummy": {"name": "anton-l/superb_dummy", "description": "Self-supervised learning (SSL) has proven vital for advancing research in\nnatural language processing (NLP) and computer vision (CV). The paradigm\npretrains a shared model on large volumes of unlabeled data and achieves\nstate-of-the-art (SOTA) for various tasks with minimal adaptation. However, the\nspeech processing community lacks a similar setup to systematically explore the\nparadigm. To bridge this gap, we introduce Speech processing Universal\nPERformance Benchmark (SUPERB). SUPERB is a leaderboard to benchmark the\nperformance of a shared model across a wide range of speech processing tasks\nwith minimal architecture changes and labeled data. Among multiple usages of the\nshared model, we especially focus on extracting the representation learned from\nSSL due to its preferable re-usability. We present a simple framework to solve\nSUPERB tasks by learning task-specialized lightweight prediction heads on top of\nthe frozen shared model. Our results demonstrate that the framework is promising\nas SSL representations show competitive generalizability and accessibility\nacross SUPERB tasks. We release SUPERB as a challenge with a leaderboard and a\nbenchmark toolkit to fuel the research in representation learning and general\nspeech processing.", "evaluation_metadata": {}}, "lmqg/qg_jaquad": {"name": "lmqg/qg_jaquad", "description": "[JaQuAD](https://github.com/SkelterLabsInc/JaQuAD) dataset for question generation (QG) task. The test set of the original \ndata is not publicly released, so we randomly sampled test questions from the training set.", "evaluation_metadata": {}}, "lmqg/qg_squad": {"name": "lmqg/qg_squad", "description": "[SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) evaluation set for the question generation (QG) models. The split \nof test and development set follows the [\"Neural Question Generation\"](https://arxiv.org/abs/1705.00106) work and is \ncompatible with the [leader board](https://paperswithcode.com/sota/question-generation-on-squad11).", "evaluation_metadata": {}}, "ashraq/dhivehi-corpus": {"name": "ashraq/dhivehi-corpus", "description": "This is a dataset put together to pretrain a language model in Dhivehi, the language of Maldives.", "evaluation_metadata": {}}, "asi/wikitext_fr": {"name": "asi/wikitext_fr", "description": "Wikitext-fr language modeling dataset consists of over 70 million tokens \nextracted from the set of french Wikipedia articles that are classified as \n\"quality articles\" or \"good articles.\". The aim is to replicate the English \nbenchmark.", "evaluation_metadata": {}}, "astrideducation/cefr-combined-no-cefr-test": {"name": "astrideducation/cefr-combined-no-cefr-test", "description": "This dataset contains 3370555 sentences, which each have an assigned CEFR level derived from EFLLex (https://cental.uclouvain.be/cefrlex/efllex/download).\n The sentences comes from \"the pile books3\", which is available on Huggingface (https://huggingface.co/datasets/the_pile_books3).\n The CEFR levels used are A1, A2, B1, B2 and C1, and there are equals number of sentences for each level.\n Assigning each sentence a CEFR level followed is based on the concept of \"shifted frequency distribution\", introduced by David Alfter and his paper can be found at (https://gupea.ub.gu.se/bitstream/2077/66861/4/gupea_2077_66861_4.pdf).\n For each word in each sentence, take the CEFR level with the highest \"shifted frequency distribution\" in the EFLLex table. \n After all words have been processed, the sentence gets annotated with the most frequently appearing CEFR level from the whole senctence.", "evaluation_metadata": {}}, "bavard/personachat_truecased": {"name": "bavard/personachat_truecased", "description": "A version of the PersonaChat dataset that has been true-cased, and also has been given more normalized punctuation.\nThe original PersonaChat dataset is in all lower case, and has extra space around each clause/sentence separating\npunctuation mark. This version of the dataset has more of a natural language look, with sentence capitalization,\nproper noun capitalization, and normalized whitespace. Also, each dialogue turn includes a pool of distractor\ncandidate responses, which can be used by a multiple choice regularization loss during training.", "evaluation_metadata": {}}, "bertin-project/mc4-es-sampled": {"name": "bertin-project/mc4-es-sampled", "description": "50 million documents in Spanish extracted from mC4 applying perplexity sampling via mc4-sampling: \"https://huggingface.co/datasets/bertin-project/mc4-sampling\". Please, refer to BERTIN Project. The original dataset is the Multlingual Colossal, Cleaned version of Common Crawl's web crawl corpus (mC4), based on the Common Crawl dataset: \"https://commoncrawl.org\", and processed by AllenAI.", "evaluation_metadata": {}}, "bigscience/P3": {"name": "bigscience/P3", "description": "P3 (Public Pool of Prompts) is a collection of prompted English datasets covering a diverse set of NLP tasks. A prompt is the combination of an input template and a target template. The templates are functions mapping a data example into natural language for the input and target sequences. For example, in the case of an NLI dataset, the data example would include fields for *Premise, Hypothesis, Label*. An input template would be *If {Premise} is true, is it also true that {Hypothesis}?*, whereas a target template can be defined with the label choices *Choices[label]*. Here *Choices* is prompt-specific metadata that consists of the options *yes, maybe, no* corresponding to *label* being entailment (0), neutral (1) or contradiction (2).\n\nPrompts are collected using [Promptsource](https://github.com/bigscience-workshop/promptsource), an interface to interactively write prompts on datasets, and collect prompt-specific metadata such as evaluation metrics. As of October 13th, there are 2'000 prompts collected for 270+ data(sub)sets. The collection of prompts of P3 is publicly available on [Promptsource](https://github.com/bigscience-workshop/promptsource).\n\nTo train [T0*](https://huggingface.co/bigscience/T0pp), we used a subset of the prompts available in Promptsource (see details [here](https://huggingface.co/bigscience/T0pp#training-data)). However, some of the prompts use `random.choice`, a method that selects uniformly at random an option in a list of valid possibilities. For reproducibility purposes, we release the collection of prompted examples used to train T0*. **The data available here are the materialized version of the prompted datasets used in [Multitask Prompted Training Enables Zero-Shot Task Generalization](https://arxiv.org/abs/2110.08207) which represent only a subset of the datasets for which there is at least one prompt in Promptsource.**", "evaluation_metadata": {}}, "biu-nlp/qa_align": {"name": "biu-nlp/qa_align", "description": "This dataset contains QA-Alignments - annotations of cross-text content overlap. \nThe task input is two sentences from two documents, roughly talking about the same event, along with their QA-SRL annotations \nwhich capture verbal predicate-argument relations in question-answer format. The output is a cross-sentence alignment between sets of QAs which denote the same information. \nSee the paper for details: QA-Align: Representing Cross-Text Content Overlap by Aligning Question-Answer Propositions, Brook Weiss et. al., EMNLP 2021.\nHere we provide both the QASRL annotations and the QA-Align annotations for the target sentences.", "evaluation_metadata": {}}, "biu-nlp/qa_srl2018": {"name": "biu-nlp/qa_srl2018", "description": "The dataset contains question-answer pairs to model verbal predicate-argument structure. The questions start with wh-words (Who, What, Where, What, etc.) and contain a verb predicate in the sentence; the answers are phrases in the sentence.\nThis dataset, a.k.a \"QASRL Bank\", \"QASRL-v2\" or \"QASRL-LS\" (Large Scale), was constructed via crowdsourcing.", "evaluation_metadata": {}}, "biu-nlp/qa_srl2020": {"name": "biu-nlp/qa_srl2020", "description": "The dataset contains question-answer pairs to model verbal predicate-argument structure. \nThe questions start with wh-words (Who, What, Where, What, etc.) and contain a verb predicate in the sentence; the answers are phrases in the sentence.\nThis dataset, a.k.a \"QASRL-GS\" (Gold Standard) or \"QASRL-2020\", was constructed via controlled crowdsourcing.\nSee the paper for details: Controlled Crowdsourcing for High-Quality QA-SRL Annotation, Roit et. al., 2020", "evaluation_metadata": {}}, "biu-nlp/qanom": {"name": "biu-nlp/qanom", "description": "The dataset contains question-answer pairs to model predicate-argument structure of deverbal nominalizations. \nThe questions start with wh-words (Who, What, Where, What, etc.) and contain a the verbal form of a nominalization from the sentence; \nthe answers are phrases in the sentence. \nSee the paper for details: QANom: Question-Answer driven SRL for Nominalizations (Klein et. al., COLING 2020)\nFor previewing the QANom data along with the verbal annotations of QASRL, check out \"https://browse.qasrl.org/\". \nThis dataset was annotated by selected workers from Amazon Mechanical Turk.", "evaluation_metadata": {}}, "blinoff/medical_qa_ru_data": {"name": "blinoff/medical_qa_ru_data", "description": "This dataset contains 190,335 Russian Q&A posts from a medical related forum.", "evaluation_metadata": {}}, "cakiki/args_me": {"name": "cakiki/args_me", "description": "The args.me corpus (version 1.0, cleaned) comprises 382 545 arguments crawled from four debate portals in the middle of 2019. The debate portals are Debatewise, IDebate.org, Debatepedia, and Debate.org. The arguments are extracted using heuristics that are designed for each debate portal.", "evaluation_metadata": {}}, "cassandra-themis/QR-AN": {"name": "cassandra-themis/QR-AN", "description": " QR-AN Dataset: a classification dataset on french Parliament debates\n This is a dataset for theme/topic classification, made of questions and answers from https://www2.assemblee-nationale.fr/recherche/resultats_questions.\n It contains 188 unbalanced classes, 80k questions-answers divided into 3 splits: train (60k), val (10k) and test (10k).", "evaluation_metadata": {}}, "castorini/afriberta-corpus": {"name": "castorini/afriberta-corpus", "description": "Corpus used for training AfriBERTa models", "evaluation_metadata": {}}, "cbrew475/hwu66": {"name": "cbrew475/hwu66", "description": "This project contains natural language data for human-robot interaction in a projecthome domain which \nXingkun Liu et al, from Heriot-Watt University, collected and annotated. It can be used for evaluating \nNLU services/platforms.", "evaluation_metadata": {}}, "ccdv/arxiv-classification": {"name": "ccdv/arxiv-classification", "description": "Arxiv Classification Dataset: a classification of Arxiv Papers (11 classes).\n It contains 11 slightly unbalanced classes, 33k Arxiv Papers divided into 3 splits: train (23k), val (5k) and test (5k).\n Copied from \"Long Document Classification From Local Word Glimpses via Recurrent Attention Learning\" by JUN HE LIQUN WANG LIU LIU, JIAO FENG AND HAO WU\n See: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8675939\n See: https://github.com/LiqunW/Long-document-dataset", "evaluation_metadata": {}}, "ccdv/arxiv-summarization": {"name": "ccdv/arxiv-summarization", "description": "Arxiv dataset for summarization.\n From paper: A Discourse-Aware Attention Model for Abstractive Summarization of Long Documents\" by A. Cohan et al.\n See: https://aclanthology.org/N18-2097.pdf \n See: https://github.com/armancohan/long-summarization", "evaluation_metadata": [{"config": "document", "task": "summarization", "task_id": "summarization", "splits": {"eval_split": "test"}, "col_mapping": {"article": "text", "abstract": "target"}}]}, "ccdv/govreport-summarization": {"name": "ccdv/govreport-summarization", "description": "GovReport dataset for summarization.\n From paper: Efficient Attentions for Long Document Summarization\" by L. Huang et al.\n See: https://arxiv.org/pdf/2104.02112.pdf \n See: https://github.com/luyang-huang96/LongDocSum", "evaluation_metadata": {}}, "ccdv/patent-classification": {"name": "ccdv/patent-classification", "description": "Patent Classification Dataset: a classification of Patents (9 classes).\n It contains 9 unbalanced classes, 35k Patents and summaries divided into 3 splits: train (25k), val (5k) and test (5k).\n Data are sampled from \"BIGPATENT: A Large-Scale Dataset for Abstractive and Coherent Summarization.\" by Eva Sharma, Chen Li and Lu Wang \n See: https://aclanthology.org/P19-1212.pdf \n See: https://evasharma.github.io/bigpatent/", "evaluation_metadata": {}}, "ccdv/pubmed-summarization": {"name": "ccdv/pubmed-summarization", "description": "PubMed dataset for summarization.\n From paper: A Discourse-Aware Attention Model for Abstractive Summarization of Long Documents\" by A. Cohan et al.\n See: https://aclanthology.org/N18-2097.pdf \n See: https://github.com/armancohan/long-summarization", "evaluation_metadata": {}}, "cdleong/piglatin-mt": {"name": "cdleong/piglatin-mt", "description": "\\\\r\\nPig-latin machine and English parallel machine translation corpus. \r\n\r\nBased on \r\nThe Project Gutenberg EBook of \"De Bello Gallico\" and Other Commentaries\r\nhttps://www.gutenberg.org/ebooks/10657\r\n\r\nConverted to pig-latin with https://github.com/bpabel/piglatin", "evaluation_metadata": {}}, "cgarciae/point-cloud-mnist": {"name": "cgarciae/point-cloud-mnist", "description": "The MNIST dataset consists of 70,000 28x28 black-and-white points in 10 classes (one for each digits), with 7,000\npoints per class. There are 60,000 training points and 10,000 test points.", "evaluation_metadata": {}}, "clarin-pl/2021-punctuation-restoration": {"name": "clarin-pl/2021-punctuation-restoration", "description": "This dataset is designed to be used in training models\nthat restore punctuation marks from the output of \nAutomatic Speech Recognition system for Polish language.", "evaluation_metadata": {}}, "clarin-pl/aspectemo": {"name": "clarin-pl/aspectemo", "description": "AspectEmo dataset: Multi-Domain Corpus of Consumer Reviews for Aspect-Based \n Sentiment Analysis", "evaluation_metadata": {}}, "clarin-pl/polemo2-official": {"name": "clarin-pl/polemo2-official", "description": "PolEmo 2.0: Corpus of Multi-Domain Consumer Reviews, evaluation data for article presented at CoNLL.", "evaluation_metadata": {}}, "classla/FRENK-hate-en": {"name": "classla/FRENK-hate-en", "description": "The FRENK Datasets of Socially Unacceptable Discourse in English.", "evaluation_metadata": {}}, "classla/FRENK-hate-hr": {"name": "classla/FRENK-hate-hr", "description": "The FRENK Datasets of Socially Unacceptable Discourse in Croatian.", "evaluation_metadata": {}}, "classla/FRENK-hate-sl": {"name": "classla/FRENK-hate-sl", "description": "The FRENK Datasets of Socially Unacceptable Discourse in Slovene.", "evaluation_metadata": {}}, "classla/copa_hr": {"name": "classla/copa_hr", "description": "The COPA-HR dataset (Choice of plausible alternatives in Croatian) is a translation \nof the English COPA dataset (https://people.ict.usc.edu/~gordon/copa.html) by following the \nXCOPA dataset translation methodology (https://arxiv.org/abs/2005.00333). The dataset consists of 1000 premises \n(My body cast a shadow over the grass), each given a question (What is the cause?), and two choices \n(The sun was rising; The grass was cut), with a label encoding which of the choices is more plausible \ngiven the annotator or translator (The sun was rising).\n\nThe dataset is split into 400 training samples, 100 validation samples, and 500 test samples. It includes the \nfollowing features: 'premise', 'choice1', 'choice2', 'label', 'question', 'changed' (boolean).", "evaluation_metadata": {}}, "classla/hr500k": {"name": "classla/hr500k", "description": "The hr500k training corpus contains about 500,000 tokens manually annotated on the levels of \ntokenisation, sentence segmentation, morphosyntactic tagging, lemmatisation and named entities. \n\nOn the sentence level, the dataset contains 20159 training samples, 1963 validation samples and 2672 test samples \nacross the respective data splits. Each sample represents a sentence and includes the following features:\nsentence ID ('sent_id'), sentence text ('text'), list of tokens ('tokens'), list of lemmas ('lemmas'), \nlist of Multext-East tags ('xpos_tags), list of UPOS tags ('upos_tags'),\nlist of morphological features ('feats'), and list of IOB tags ('iob_tags'). The 'upos_tags' and 'iob_tags' features\nare encoded as class labels.", "evaluation_metadata": {}}, "classla/janes_tag": {"name": "classla/janes_tag", "description": "The dataset contains 6273 training samples, 762 validation samples and 749 test samples. \nEach sample represents a sentence and includes the following features: sentence ID ('sent_id'), \nlist of tokens ('tokens'), list of normalised word forms ('norms'), list of lemmas ('lemmas'), \nlist of Multext-East tags ('xpos_tags), list of morphological features ('feats'), \nand list of UPOS tags ('upos_tags'), which are encoded as class labels.", "evaluation_metadata": {}}, "classla/reldi_hr": {"name": "classla/reldi_hr", "description": "The dataset contains 6339 training samples, 815 validation samples and 785 test samples. \nEach sample represents a sentence and includes the following features: sentence ID ('sent_id'), \nlist of tokens ('tokens'), list of lemmas ('lemmas'), list of UPOS tags ('upos_tags'), \nlist of Multext-East tags ('xpos_tags), list of morphological features ('feats'), \nand list of IOB tags ('iob_tags'), which are encoded as class labels.", "evaluation_metadata": {}}, "classla/reldi_sr": {"name": "classla/reldi_sr", "description": "The dataset contains 5462 training samples, 711 validation samples and 725 test samples. \nEach sample represents a sentence and includes the following features: sentence ID ('sent_id'), \nlist of tokens ('tokens'), list of lemmas ('lemmas'), list of UPOS tags ('upos_tags'), \nlist of Multext-East tags ('xpos_tags), list of morphological features ('feats'), \nand list of IOB tags ('iob_tags'), which are encoded as class labels.", "evaluation_metadata": {}}, "classla/setimes_sr": {"name": "classla/setimes_sr", "description": "SETimes_sr is a Serbian dataset annotated for morphosyntactic information and named entities.\n\nThe dataset contains 3177 training samples, 395 validation samples and 319 test samples \nacross the respective data splits. Each sample represents a sentence and includes the following features:\nsentence ID ('sent_id'), sentence text ('text'), list of tokens ('tokens'), list of lemmas ('lemmas'), \nlist of Multext-East tags ('xpos_tags), list of UPOS tags ('upos_tags'),\nlist of morphological features ('feats'), and list of IOB tags ('iob_tags'). The 'upos_tags' and 'iob_tags' features\nare encoded as class labels.", "evaluation_metadata": {}}, "classla/ssj500k": {"name": "classla/ssj500k", "description": "The dataset contains 7432 training samples, 1164 validation samples and 893 test samples. \nEach sample represents a sentence and includes the following features: sentence ID ('sent_id'), \nlist of tokens ('tokens'), list of lemmas ('lemmas'), \nlist of Multext-East tags ('xpos_tags), list of UPOS tags ('upos_tags'), list of morphological features ('feats'), \nlist of IOB tags ('iob_tags'), and list of universal dependency tags ('uds'). Three dataset configurations are\navailable, where the corresponding features are encoded as class labels: 'ner', 'upos', and 'ud'.", "evaluation_metadata": {}}, "clips/mfaq": {"name": "clips/mfaq", "description": "We present the first multilingual FAQ dataset publicly available. We collected around 6M FAQ pairs from the web, in 21 different languages.", "evaluation_metadata": {}}, "clips/mqa": {"name": "clips/mqa", "description": "MQA is a multilingual corpus of questions and answers parsed from the Common Crawl. Questions are divided between Frequently Asked Questions (FAQ) pages and Community Question Answering (CQA) pages.", "evaluation_metadata": {}}, "coastalcph/fairlex": {"name": "coastalcph/fairlex", "description": "Fairlex: A multilingual benchmark for evaluating fairness in legal text processing.", "evaluation_metadata": {}}, "collectivat/tv3_parla": {"name": "collectivat/tv3_parla", "description": "This corpus includes 240 hours of Catalan speech from broadcast material.\nThe details of segmentation, data processing and also model training are explained in K\u00fclebi, \u00d6ktem; 2018.\nThe content is owned by Corporaci\u00f3 Catalana de Mitjans Audiovisuals, SA (CCMA);\nwe processed their material and hereby making it available under their terms of use.\n\nThis project was supported by the Softcatal\u00e0 Association.", "evaluation_metadata": {}}, "corypaik/coda": {"name": "corypaik/coda", "description": "*The Color Dataset* (CoDa) is a probing dataset to evaluate the representation of visual properties in language models. CoDa consists of color distributions for 521 common objects, which are split into 3 groups: Single, Multi, and Any.", "evaluation_metadata": {}}, "corypaik/prost": {"name": "corypaik/prost", "description": "*Physical Reasoning about Objects Through Space and Time* (PROST) is a probing dataset to evaluate the ability of pretrained LMs to understand and reason about the physical world. PROST consists of 18,736 cloze-style multiple choice questions from 14 manually curated templates, covering 10 physical reasoning concepts: direction, mass, height, circumference, stackable, rollable, graspable, breakable, slideable, and bounceable.", "evaluation_metadata": {}}, "craffel/openai_lambada": {"name": "craffel/openai_lambada", "description": "LAMBADA dataset variant used by OpenAI to evaluate GPT-2 and GPT-3.", "evaluation_metadata": {}}, "csebuetnlp/xlsum": {"name": "csebuetnlp/xlsum", "description": "We present XLSum, a comprehensive and diverse dataset comprising 1.35 million professionally \nannotated article-summary pairs from BBC, extracted using a set of carefully designed heuristics.\nThe dataset covers 45 languages ranging from low to high-resource, for many of which no\npublic dataset is currently available. XL-Sum is highly abstractive, concise, \nand of high quality, as indicated by human and intrinsic evaluation.", "evaluation_metadata": {}}, "csebuetnlp/xnli_bn": {"name": "csebuetnlp/xnli_bn", "description": "This is a Natural Language Inference (NLI) dataset for Bengali, curated using the subset of\nMNLI data used in XNLI and state-of-the-art English to Bengali translation model.", "evaluation_metadata": {}}, "ctu-aic/csfever": {"name": "ctu-aic/csfever", "description": "CsFEVER is a Czech localisation of the English FEVER datgaset.", "evaluation_metadata": {}}, "ctu-aic/csfever_nli": {"name": "ctu-aic/csfever_nli", "description": "CsfeverNLI is a NLI version of the Czech Csfever dataset", "evaluation_metadata": {}}, "ctu-aic/ctkfacts_nli": {"name": "ctu-aic/ctkfacts_nli", "description": "CtkFactsNLI is a NLI version of the Czech CTKFacts dataset", "evaluation_metadata": {}}, "dalle-mini/YFCC100M_OpenAI_subset": {"name": "dalle-mini/YFCC100M_OpenAI_subset", "description": "The YFCC100M is one of the largest publicly and freely useable multimedia collection, containing the metadata of around 99.2 million photos and 0.8 million videos from Flickr, all of which were shared under one of the various Creative Commons licenses.\n\nThis version is a subset defined in openai/CLIP.", "evaluation_metadata": {}}, "dataset/wikipedia_bn": {"name": "dataset/wikipedia_bn", "description": "Bengali Wikipedia from the dump of 03/20/2021.\nThe data was processed using the huggingface datasets wikipedia script early april 2021.\nThe dataset was built from the Wikipedia dump (https://dumps.wikimedia.org/).\nEach example contains the content of one full Wikipedia article with cleaning to strip\nmarkdown and unwanted sections (references, etc.).", "evaluation_metadata": {}}, "deepset/germandpr": {"name": "deepset/germandpr", "description": "We take GermanQuAD as a starting point and add hard negatives from a dump of the full German Wikipedia following the approach of the DPR authors (Karpukhin et al., 2020). The format of the dataset also resembles the one of DPR. GermanDPR comprises 9275 question/answer pairs in the training set and 1025 pairs in the test set. For each pair, there are one positive context and three hard negative contexts.", "evaluation_metadata": {}}, "deepset/germanquad": {"name": "deepset/germanquad", "description": "In order to raise the bar for non-English QA, we are releasing a high-quality, human-labeled German QA dataset consisting of 13 722 questions, incl. a three-way annotated test set.\nThe creation of GermanQuAD is inspired by insights from existing datasets as well as our labeling experience from several industry projects. We combine the strengths of SQuAD, such as high out-of-domain performance, with self-sufficient questions that contain all relevant information for open-domain QA as in the NaturalQuestions dataset. Our training and test datasets do not overlap like other popular datasets and include complex questions that cannot be answered with a single entity or only a few words.", "evaluation_metadata": [{"config": "plain_text", "task": "question-answering", "task_id": "extractive_question_answering", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"context": "context", "question": "question", "answers.text": "answers.text", "answers.answer_start": "answers.answer_start"}}]}, "DFKI-SLT/few-nerd": {"name": "DFKI-SLT/few-nerd", "description": "Few-NERD is a large-scale, fine-grained manually annotated named entity recognition dataset, \nwhich contains 8 coarse-grained types, 66 fine-grained types, 188,200 sentences, 491,711 entities \nand 4,601,223 tokens. Three benchmark tasks are built, one is supervised: Few-NERD (SUP) and the \nother two are few-shot: Few-NERD (INTRA) and Few-NERD (INTER).", "evaluation_metadata": {}}, "DFKI-SLT/mobie": {"name": "DFKI-SLT/mobie", "description": "MobIE is a German-language dataset which is human-annotated with 20 coarse- and fine-grained entity types and entity linking information for geographically linkable entities. The dataset consists of 3,232 social media texts and traffic reports with 91K tokens, and contains 20.5K annotated entities, 13.1K of which are linked to a knowledge base. A subset of the dataset is human-annotated with seven mobility-related, n-ary relation types, while the remaining documents are annotated using a weakly-supervised labeling approach implemented with the Snorkel framework. The dataset combines annotations for NER, EL and RE, and thus can be used for joint and multi-task learning of these fundamental information extraction tasks.", "evaluation_metadata": {}}, "dk-crazydiv/huggingface-modelhub": {"name": "dk-crazydiv/huggingface-modelhub", "description": "Metadata information of all the models available on HuggingFace's modelhub", "evaluation_metadata": {}}, "dlb/plue": {"name": "dlb/plue", "description": "PLUE: Portuguese Language Understanding Evaluationis a Portuguese translation of \nthe GLUE benchmark and Scitail using OPUS-MT model and Google Cloud Translation.", "evaluation_metadata": {}}, "dynabench/dynasent": {"name": "dynabench/dynasent", "description": " Dynabench.DynaSent is a Sentiment Analysis dataset collected using a \n human-and-model-in-the-loop.", "evaluation_metadata": {}}, "dynabench/qa": {"name": "dynabench/qa", "description": " Dynabench.QA is a Reading Comprehension dataset collected using a human-and-model-in-the-loop.", "evaluation_metadata": {}}, "ebrigham/labels": {"name": "ebrigham/labels", "description": "AG is a collection of more than 1 million news articles. News articles have been\ngathered from more than 2000 news sources by ComeToMyHead in more than 1 year of\nactivity. ComeToMyHead is an academic news search engine which has been running\nsince July, 2004. The dataset is provided by the academic comunity for research\npurposes in data mining (clustering, classification, etc), information retrieval\n(ranking, search, etc), xml, data compression, data streaming, and any other\nnon-commercial activity. For more information, please refer to the link\nhttp://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .\nThe AG's news topic classification dataset is constructed by Xiang Zhang\n(xiang.zhang@nyu.edu) from the dataset above. It is used as a text\nclassification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann\nLeCun. Character-level Convolutional Networks for Text Classification. Advances\nin Neural Information Processing Systems 28 (NIPS 2015).", "evaluation_metadata": {}}, "echarlaix/gqa": {"name": "echarlaix/gqa", "description": "GQA is a new dataset for real-world visual reasoning and compositional question answering,\nseeking to address key shortcomings of previous visual question answering (VQA) datasets.", "evaluation_metadata": {}}, "echarlaix/vqa": {"name": "echarlaix/vqa", "description": "VQA is a new dataset containing open-ended questions about images. \nThese questions require an understanding of vision, language and commonsense knowledge to answer.", "evaluation_metadata": {}}, "edbeeching/decision_transformer_gym_replay": {"name": "edbeeching/decision_transformer_gym_replay", "description": "A subset of the D4RL dataset, used for training Decision Transformers", "evaluation_metadata": {}}, "eugenesiow/BSD100": {"name": "eugenesiow/BSD100", "description": "BSD is a dataset used frequently for image denoising and super-resolution. \nBSD100 is the testing set of the Berkeley segmentation dataset BSD300.", "evaluation_metadata": {}}, "eugenesiow/Div2k": {"name": "eugenesiow/Div2k", "description": "DIV2K dataset: DIVerse 2K resolution high quality images as used for the challenges @ NTIRE (CVPR 2017 and \nCVPR 2018) and @ PIRM (ECCV 2018)", "evaluation_metadata": {}}, "eugenesiow/PIRM": {"name": "eugenesiow/PIRM", "description": "The PIRM dataset consists of 200 images, which are divided into two equal sets for validation and testing. \nThese images cover diverse contents, including people, objects, environments, flora, natural scenery, etc. \nImages vary in size, and are typically ~300K pixels in resolution.\n\nThis dataset was first used for evaluating the perceptual quality of super-resolution algorithms in The 2018 PIRM \nchallenge on Perceptual Super-resolution, in conjunction with ECCV 2018.", "evaluation_metadata": {}}, "eugenesiow/Set14": {"name": "eugenesiow/Set14", "description": "Set14 is an evaluation dataset with 14 RGB images for the image super resolution task.", "evaluation_metadata": {}}, "eugenesiow/Set5": {"name": "eugenesiow/Set5", "description": "Set5 is a evaluation dataset with 5 RGB images for the image super resolution task.", "evaluation_metadata": {}}, "eugenesiow/Urban100": {"name": "eugenesiow/Urban100", "description": "The Urban100 dataset contains 100 images of urban scenes. \nIt commonly used as a test set to evaluate the performance of super-resolution models.", "evaluation_metadata": {}}, "facebook/multilingual_librispeech": {"name": "facebook/multilingual_librispeech", "description": "This is a streamable version of the Multilingual LibriSpeech (MLS) dataset. \nThe data archives were restructured from the original ones from [OpenSLR](http://www.openslr.org/94) \nto make it easier to stream. \n\nMLS dataset is a large multilingual corpus suitable for speech research. \nThe dataset is derived from read audiobooks from LibriVox and consists of 8 languages: \nEnglish, German, Dutch, Spanish, French, Italian, Portuguese, Polish.", "evaluation_metadata": {}}, "fhamborg/news_sentiment_newsmtsc": {"name": "fhamborg/news_sentiment_newsmtsc", "description": "NewsMTSC: A large, manually annotated dataset for target-dependent sentiment classification in English news articles.", "evaluation_metadata": {}}, "flax-community/german_common_crawl": {"name": "flax-community/german_common_crawl", "description": "German Only Extract from Common Crawl\n\nThis Dataset is for pretraining a German Language Model (Unsupervised) or tune a Multilingual Model specifically to German", "evaluation_metadata": {}}, "flax-community/swahili-safi": {"name": "flax-community/swahili-safi", "description": "Cleaned dataset for Swahili Language Modeling", "evaluation_metadata": {}}, "flax-sentence-embeddings/stackexchange_math_jsonl": {"name": "flax-sentence-embeddings/stackexchange_math_jsonl", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl": {"name": "flax-sentence-embeddings/stackexchange_title_best_voted_answer_jsonl", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl": {"name": "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl": {"name": "flax-sentence-embeddings/stackexchange_titlebody_best_voted_answer_jsonl", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "flexthink/librig2p-nostress-space": {"name": "flexthink/librig2p-nostress-space", "description": "Grapheme-to-Phoneme training, validation and test sets", "evaluation_metadata": {}}, "flexthink/librig2p-nostress": {"name": "flexthink/librig2p-nostress", "description": "Grapheme-to-Phoneme training, validation and test sets", "evaluation_metadata": {}}, "florianbussmann/FUNSD-vu2020revising": {"name": "florianbussmann/FUNSD-vu2020revising", "description": "\\\r\nFUNSD is one of the limited publicly available datasets for information extraction from document images.\r\nThe information in the FUNSD dataset is defined by text areas of four categories (\"key\", \"value\", \"header\", \"other\", and \"background\")\r\nand connectivity between areas as key-value relations. Inspecting FUNSD, we found several inconsistency in labeling, which impeded its\r\napplicability to the key-value extraction problem. In this report, we described some labeling issues in FUNSD and the revision we made\r\nto the dataset.", "evaluation_metadata": {}}, "florianbussmann/train_tickets-yu2020pick": {"name": "florianbussmann/train_tickets-yu2020pick", "description": "\\\r\nThe train ticket is fixed layout dataset, however, it contains background noise and imaging distortions.\r\nIt contains 1,530 synthetic images and 320 real images for training, and 80 real images for testing.\r\nEvery train ticket has eight key text fields including ticket number, starting station, train number, destination station, date, ticket rates, seat category, and name.\r\nThis dataset mainly consists of digits, English characters, and Chinese characters.", "evaluation_metadata": {}}, "frtna/jwt300_mt": {"name": "frtna/jwt300_mt", "description": "This new dataset is designed to be used in the scope of machine translation project.", "evaluation_metadata": {}}, "frtna/opensubtitles_mt": {"name": "frtna/opensubtitles_mt", "description": "This new dataset is designed to be used in the scope of PhD project.", "evaluation_metadata": {}}, "frtna/ted_mt": {"name": "frtna/ted_mt", "description": "This new dataset is designed to be used in the scope of multilingual model project.", "evaluation_metadata": {}}, "gcaillaut/citeseer": {"name": "gcaillaut/citeseer", "description": "The CiteSeer dataset consists of 3312 scientific publications classified into one of six classes. The citation network consists of 4732 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 3703 unique words. The README file in the dataset provides more details.", "evaluation_metadata": {}}, "gcaillaut/cora": {"name": "gcaillaut/cora", "description": "The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words.", "evaluation_metadata": {}}, "gcaillaut/frwiki_good_pages_el": {"name": "gcaillaut/frwiki_good_pages_el", "description": "French Wikipedia dataset for Entity Linking", "evaluation_metadata": {}}, "gcaillaut/pubmed": {"name": "gcaillaut/pubmed", "description": "The Pubmed Diabetes dataset consists of 19717 scientific publications from PubMed database pertaining to diabetes classified into one of three classes. The citation network consists of 44338 links. Each publication in the dataset is described by a TF/IDF weighted word vector from a dictionary which consists of 500 unique words. The README file in the dataset provides more details.", "evaluation_metadata": {}}, "ghomasHudson/muld": {"name": "ghomasHudson/muld", "description": "MuLD: The Multitask Long Document Benchmark\nA set of NLP tasks where each example is over 10,000 tokens long.", "evaluation_metadata": {}}, "ghomasHudson/vlsp": {"name": "ghomasHudson/vlsp", "description": "Very Long version of the scientific papers summarization dataset. Only includes theses over 10,000 tokens long.", "evaluation_metadata": {}}, "gigant/romanian_speech_synthesis_0_8_1": {"name": "gigant/romanian_speech_synthesis_0_8_1", "description": "\\\r\nThe Romanian speech synthesis (RSS) corpus was recorded in a hemianechoic chamber (anechoic walls and ceiling; floor partially anechoic) at the University of Edinburgh. We used three high quality studio microphones: a Neumann u89i (large diaphragm condenser), a Sennheiser MKH 800 (small diaphragm condenser with very wide bandwidth) and a DPA 4035 (headset-mounted condenser). Although the current release includes only speech data recorded via Sennheiser MKH 800, we may release speech data recorded via other microphones in the future. All recordings were made at 96 kHz sampling frequency and 24 bits per sample, then downsampled to 48 kHz sampling frequency. For recording, downsampling and bit rate conversion, we used ProTools HD hardware and software. We conducted 8 sessions over the course of a month, recording about 500 sentences in each session. At the start of each session, the speaker listened to a previously recorded sample, in order to attain a similar voice quality and intonation.", "evaluation_metadata": {}}, "gmnlp/tico19": {"name": "gmnlp/tico19", "description": "In response to the on-going crisis, several academic (Carnegie Mellon University, \nGeorge Mason University, Johns Hopkins University) and industry (Amazon, Appen, \nFacebook, Google, Microsoft, Translated) partners have partnered with the Translators \nwithout Borders to prepare COVID-19 materials for a variety of the world\u2019s languages \nto be used by professional translators and for training state-of-the-art Machine \nTranslation (MT) models. The focus is on making emergency and crisis-related content \navailable in as many languages as possible. The collected, curated and translated \ncontent across nearly 90 languages will be available to the professional translation \nas well the MT research community.", "evaluation_metadata": {}}, "gsarti/clean_mc4_it": {"name": "gsarti/clean_mc4_it", "description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.", "evaluation_metadata": {}}, "gsarti/flores_101": {"name": "gsarti/flores_101", "description": "One of the biggest challenges hindering progress in low-resource and multilingual machine translation is the \nlack of good evaluation benchmarks. Current evaluation benchmarks either lack good coverage of low-resource \nlanguages, consider only restricted domains, or are low quality because they are constructed using \nsemi-automatic procedures. In this work, we introduce the FLORES evaluation benchmark, consisting of 3001 \nsentences extracted from English Wikipedia and covering a variety of different topics and domains. \nThese sentences have been translated in 101 languages by professional translators through a carefully \ncontrolled process. The resulting dataset enables better assessment of model quality on the long tail of \nlow-resource languages, including the evaluation of many-to-many multilingual translation systems, as all \ntranslations are multilingually aligned. By publicly releasing such a high-quality and high-coverage dataset, \nwe hope to foster progress in the machine translation community and beyond.", "evaluation_metadata": {}}, "gsarti/itacola": {"name": "gsarti/itacola", "description": "The Italian Corpus of Linguistic Acceptability includes almost 10k sentences taken from \nlinguistic literature with a binary annotation made by the original authors themselves. \nThe work is inspired by the English Corpus of Linguistic Acceptability (CoLA) by Warstadt et al.\nPart of the dataset has been manually annotated to highlight 9 linguistic phenomena.", "evaluation_metadata": {}}, "gsarti/wmt_vat": {"name": "gsarti/wmt_vat", "description": "The Variance-Aware Machine Translation corpus contains 70 small and discriminative test sets for machine translation (MT) \nevaluation called variance-aware test sets (VAT), covering 35 translation directions from WMT16 to WMT20 competitions. \nVAT is automatically created by a novel variance-aware filtering method that filters the indiscriminative test instances \nof the current MT benchmark without any human labor. Experimental results show that VAT outperforms the original WMT benchmark \nin terms of the correlation with human judgment across mainstream language pairs and test sets. Further analysis on the properties \nof VAT reveals the challenging linguistic features (e.g., translation of low-frequency words and proper nouns) for the competitive \nMT systems, providing guidance for constructing future MT test sets.", "evaluation_metadata": {}}, "hf-internal-testing/librispeech_asr_demo": {"name": "hf-internal-testing/librispeech_asr_demo", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```", "evaluation_metadata": {}}, "hf-internal-testing/librispeech_asr_dummy": {"name": "hf-internal-testing/librispeech_asr_dummy", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```", "evaluation_metadata": {}}, "huggingartists/100-gecs": {"name": "huggingartists/100-gecs", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/21-savage": {"name": "huggingartists/21-savage", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/25-17": {"name": "huggingartists/25-17", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/50-cent": {"name": "huggingartists/50-cent", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/5nizza": {"name": "huggingartists/5nizza", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/5opka": {"name": "huggingartists/5opka", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/6ix9ine": {"name": "huggingartists/6ix9ine", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/aaron-watson": {"name": "huggingartists/aaron-watson", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/abba": {"name": "huggingartists/abba", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/adele": {"name": "huggingartists/adele", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/agata-christie": {"name": "huggingartists/agata-christie", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/aikko": {"name": "huggingartists/aikko", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/aimer": {"name": "huggingartists/aimer", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/ajr": {"name": "huggingartists/ajr", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/alan-walker": {"name": "huggingartists/alan-walker", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/andre-3000": {"name": "huggingartists/andre-3000", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/arash": {"name": "huggingartists/arash", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/architects": {"name": "huggingartists/architects", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/arctic-monkeys": {"name": "huggingartists/arctic-monkeys", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/ariana-grande": {"name": "huggingartists/ariana-grande", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/ariya": {"name": "huggingartists/ariya", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/armin-van-buuren": {"name": "huggingartists/armin-van-buuren", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/as-i-lay-dying": {"name": "huggingartists/as-i-lay-dying", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/asdfgfa": {"name": "huggingartists/asdfgfa", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/asper-x": {"name": "huggingartists/asper-x", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/baklan": {"name": "huggingartists/baklan", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/big-baby-tape": {"name": "huggingartists/big-baby-tape", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/big-russian-boss": {"name": "huggingartists/big-russian-boss", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/bill-wurtz": {"name": "huggingartists/bill-wurtz", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/billie-eilish": {"name": "huggingartists/billie-eilish", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/billy-talent": {"name": "huggingartists/billy-talent", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/bladee": {"name": "huggingartists/bladee", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/bob-dylan": {"name": "huggingartists/bob-dylan", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/bones": {"name": "huggingartists/bones", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/booker": {"name": "huggingartists/booker", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/boris-grebenshikov": {"name": "huggingartists/boris-grebenshikov", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/braii": {"name": "huggingartists/braii", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/bring-me-the-horizon": {"name": "huggingartists/bring-me-the-horizon", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/bruce-springsteen": {"name": "huggingartists/bruce-springsteen", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/bryan-adams": {"name": "huggingartists/bryan-adams", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/burzum": {"name": "huggingartists/burzum", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/bushido-zho": {"name": "huggingartists/bushido-zho", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/cardi-b": {"name": "huggingartists/cardi-b", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/chester-bennington": {"name": "huggingartists/chester-bennington", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/chief-keef": {"name": "huggingartists/chief-keef", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/cocomelon": {"name": "huggingartists/cocomelon", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/coin": {"name": "huggingartists/coin", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/coldplay": {"name": "huggingartists/coldplay", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/dababy": {"name": "huggingartists/dababy", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/david-bowie": {"name": "huggingartists/david-bowie", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/ddt": {"name": "huggingartists/ddt", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/death-grips": {"name": "huggingartists/death-grips", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/deep-purple": {"name": "huggingartists/deep-purple", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/denderty": {"name": "huggingartists/denderty", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/dermot-kennedy": {"name": "huggingartists/dermot-kennedy", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/dj-artem-artemov": {"name": "huggingartists/dj-artem-artemov", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/doja-cat": {"name": "huggingartists/doja-cat", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/drake": {"name": "huggingartists/drake", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/dua-lipa": {"name": "huggingartists/dua-lipa", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/duran-duran": {"name": "huggingartists/duran-duran", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/dzhizus": {"name": "huggingartists/dzhizus", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/ed-sheeran": {"name": "huggingartists/ed-sheeran", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/egor-kreed": {"name": "huggingartists/egor-kreed", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/egor-letov": {"name": "huggingartists/egor-letov", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/elton-john": {"name": "huggingartists/elton-john", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/eminem": {"name": "huggingartists/eminem", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/enigma": {"name": "huggingartists/enigma", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/enya": {"name": "huggingartists/enya", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/epic-rap-battles-of-history": {"name": "huggingartists/epic-rap-battles-of-history", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/face": {"name": "huggingartists/face", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/fascinoma": {"name": "huggingartists/fascinoma", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/fear-factory": {"name": "huggingartists/fear-factory", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/florence-the-machine": {"name": "huggingartists/florence-the-machine", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/freddie-dredd": {"name": "huggingartists/freddie-dredd", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/freelancer": {"name": "huggingartists/freelancer", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/galenskaparna-and-after-shave": {"name": "huggingartists/galenskaparna-and-after-shave", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/ghost": {"name": "huggingartists/ghost", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/ghostemane": {"name": "huggingartists/ghostemane", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/ghostmane": {"name": "huggingartists/ghostmane", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/gizmo": {"name": "huggingartists/gizmo", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/gorillaz": {"name": "huggingartists/gorillaz", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/green-day": {"name": "huggingartists/green-day", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/grigory-leps": {"name": "huggingartists/grigory-leps", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/grimes": {"name": "huggingartists/grimes", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/gspd": {"name": "huggingartists/gspd", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/gunna": {"name": "huggingartists/gunna", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/hyuna": {"name": "huggingartists/hyuna", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/i-dont-know-how-but-they-found-me": {"name": "huggingartists/i-dont-know-how-but-they-found-me", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/idktime": {"name": "huggingartists/idktime", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/imagine-dragons": {"name": "huggingartists/imagine-dragons", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/jah-khalib": {"name": "huggingartists/jah-khalib", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/jim-morrison": {"name": "huggingartists/jim-morrison", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/john-k-samson": {"name": "huggingartists/john-k-samson", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/john-lennon": {"name": "huggingartists/john-lennon", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/joji": {"name": "huggingartists/joji", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/joni-mitchell": {"name": "huggingartists/joni-mitchell", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/justin-bieber": {"name": "huggingartists/justin-bieber", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/kanye-west": {"name": "huggingartists/kanye-west", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/kasta": {"name": "huggingartists/kasta", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/katy-perry": {"name": "huggingartists/katy-perry", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/kehlani": {"name": "huggingartists/kehlani", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/kendrick-lamar": {"name": "huggingartists/kendrick-lamar", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/kesha": {"name": "huggingartists/kesha", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/king-krule": {"name": "huggingartists/king-krule", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/kipelov": {"name": "huggingartists/kipelov", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/kishlak": {"name": "huggingartists/kishlak", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/kizaru": {"name": "huggingartists/kizaru", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/kojey-radical": {"name": "huggingartists/kojey-radical", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/krechet": {"name": "huggingartists/krechet", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/krept-and-konan-bugzy-malone-sl-morisson-abra-cadabra-rv-and-snap-capone": {"name": "huggingartists/krept-and-konan-bugzy-malone-sl-morisson-abra-cadabra-rv-and-snap-capone", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/kurt-cobain": {"name": "huggingartists/kurt-cobain", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lady-gaga": {"name": "huggingartists/lady-gaga", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lazy-jay": {"name": "huggingartists/lazy-jay", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/led-zeppelin": {"name": "huggingartists/led-zeppelin", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lil-baby": {"name": "huggingartists/lil-baby", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lil-nas-x": {"name": "huggingartists/lil-nas-x", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lil-peep": {"name": "huggingartists/lil-peep", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lil-skies": {"name": "huggingartists/lil-skies", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lil-uzi-vert": {"name": "huggingartists/lil-uzi-vert", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/linkin-park": {"name": "huggingartists/linkin-park", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/little-big": {"name": "huggingartists/little-big", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lizer": {"name": "huggingartists/lizer", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/logic": {"name": "huggingartists/logic", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lorde": {"name": "huggingartists/lorde", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/loud-luxury": {"name": "huggingartists/loud-luxury", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/loverance": {"name": "huggingartists/loverance", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lovv66": {"name": "huggingartists/lovv66", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lumen": {"name": "huggingartists/lumen", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/lyapis-trubetskoy": {"name": "huggingartists/lyapis-trubetskoy", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/macan": {"name": "huggingartists/macan", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/machine-gun-kelly": {"name": "huggingartists/machine-gun-kelly", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/madonna": {"name": "huggingartists/madonna", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/marillion": {"name": "huggingartists/marillion", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/maroon-5": {"name": "huggingartists/maroon-5", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/mashina-vremeni": {"name": "huggingartists/mashina-vremeni", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/mating-ritual": {"name": "huggingartists/mating-ritual", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/max-korzh": {"name": "huggingartists/max-korzh", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/mayot": {"name": "huggingartists/mayot", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/mc-ride": {"name": "huggingartists/mc-ride", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/melanie-martinez": {"name": "huggingartists/melanie-martinez", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/metallica": {"name": "huggingartists/metallica", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/mf-doom": {"name": "huggingartists/mf-doom", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/michael-jackson": {"name": "huggingartists/michael-jackson", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/mikhail-gorshenev": {"name": "huggingartists/mikhail-gorshenev", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/mikhail-krug": {"name": "huggingartists/mikhail-krug", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/miyagi": {"name": "huggingartists/miyagi", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/mnogoznaal": {"name": "huggingartists/mnogoznaal", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/morgenshtern": {"name": "huggingartists/morgenshtern", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/mumiy-troll": {"name": "huggingartists/mumiy-troll", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/muse": {"name": "huggingartists/muse", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/nautilus-pompilius": {"name": "huggingartists/nautilus-pompilius", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/nervy": {"name": "huggingartists/nervy", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/nicki-minaj": {"name": "huggingartists/nicki-minaj", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/nirvana": {"name": "huggingartists/nirvana", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/noize-mc": {"name": "huggingartists/noize-mc", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/oasis": {"name": "huggingartists/oasis", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/obladaet": {"name": "huggingartists/obladaet", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/og-buda": {"name": "huggingartists/og-buda", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/ot-rus": {"name": "huggingartists/ot-rus", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/our-last-night": {"name": "huggingartists/our-last-night", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/oxxxymiron": {"name": "huggingartists/oxxxymiron", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/peter-paul-and-mary": {"name": "huggingartists/peter-paul-and-mary", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/pharaoh": {"name": "huggingartists/pharaoh", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/phish": {"name": "huggingartists/phish", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/pink-floyd": {"name": "huggingartists/pink-floyd", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/placebo": {"name": "huggingartists/placebo", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/platina": {"name": "huggingartists/platina", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/pop-smoke": {"name": "huggingartists/pop-smoke", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/post-malone": {"name": "huggingartists/post-malone", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/pyrokinesis": {"name": "huggingartists/pyrokinesis", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/queen": {"name": "huggingartists/queen", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/radiohead": {"name": "huggingartists/radiohead", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/rage-against-the-machine": {"name": "huggingartists/rage-against-the-machine", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/ramil": {"name": "huggingartists/ramil", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/rammstein": {"name": "huggingartists/rammstein", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/red-hot-chili-peppers": {"name": "huggingartists/red-hot-chili-peppers", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/rex-orange-county": {"name": "huggingartists/rex-orange-county", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/rihanna": {"name": "huggingartists/rihanna", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/rocket": {"name": "huggingartists/rocket", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/sam-kim": {"name": "huggingartists/sam-kim", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/scriptonite": {"name": "huggingartists/scriptonite", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/sektor-gaza": {"name": "huggingartists/sektor-gaza", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/selena-gomez": {"name": "huggingartists/selena-gomez", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/sergei-letov": {"name": "huggingartists/sergei-letov", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/shadowraze": {"name": "huggingartists/shadowraze", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/sia": {"name": "huggingartists/sia", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/sid-sriram": {"name": "huggingartists/sid-sriram", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/skillet": {"name": "huggingartists/skillet", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/slava-kpss": {"name": "huggingartists/slava-kpss", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/slava-marlow": {"name": "huggingartists/slava-marlow", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/snoop-dogg": {"name": "huggingartists/snoop-dogg", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/sqwore": {"name": "huggingartists/sqwore", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/sugar-ray": {"name": "huggingartists/sugar-ray", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/suicideoscope": {"name": "huggingartists/suicideoscope", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/sum-41": {"name": "huggingartists/sum-41", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/sundara-karma": {"name": "huggingartists/sundara-karma", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/system-of-a-down": {"name": "huggingartists/system-of-a-down", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/t-fest": {"name": "huggingartists/t-fest", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/tanzy-minus": {"name": "huggingartists/tanzy-minus", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/taylor-swift": {"name": "huggingartists/taylor-swift", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/tedeschi-trucks-band": {"name": "huggingartists/tedeschi-trucks-band", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-69-eyes": {"name": "huggingartists/the-69-eyes", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-avalanches": {"name": "huggingartists/the-avalanches", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-beatles": {"name": "huggingartists/the-beatles", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-gazette": {"name": "huggingartists/the-gazette", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-grateful-dead": {"name": "huggingartists/the-grateful-dead", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-king-and-the-jester": {"name": "huggingartists/the-king-and-the-jester", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-notorious-big": {"name": "huggingartists/the-notorious-big", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-sugarcubes": {"name": "huggingartists/the-sugarcubes", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-the-pigs": {"name": "huggingartists/the-the-pigs", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-velvet-underground": {"name": "huggingartists/the-velvet-underground", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/the-weeknd": {"name": "huggingartists/the-weeknd", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/tiamat": {"name": "huggingartists/tiamat", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/till-lindemann": {"name": "huggingartists/till-lindemann", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/tom-waits": {"name": "huggingartists/tom-waits", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/tony-raut-and-garry-topor": {"name": "huggingartists/tony-raut-and-garry-topor", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/tool": {"name": "huggingartists/tool", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/totpoc": {"name": "huggingartists/totpoc", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/travis-scott": {"name": "huggingartists/travis-scott", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/twenty-one-pilots": {"name": "huggingartists/twenty-one-pilots", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/tyler-the-creator": {"name": "huggingartists/tyler-the-creator", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/upsahl": {"name": "huggingartists/upsahl", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/v-x-v-prince": {"name": "huggingartists/v-x-v-prince", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/van-morrison": {"name": "huggingartists/van-morrison", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/veggietales": {"name": "huggingartists/veggietales", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/viktor-tsoi": {"name": "huggingartists/viktor-tsoi", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/vladimir-vysotsky": {"name": "huggingartists/vladimir-vysotsky", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/xxxtentacion": {"name": "huggingartists/xxxtentacion", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/young-thug": {"name": "huggingartists/young-thug", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/yung-lean": {"name": "huggingartists/yung-lean", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/yung-plague": {"name": "huggingartists/yung-plague", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "huggingartists/zemfira": {"name": "huggingartists/zemfira", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "it5/datasets": {"name": "it5/datasets", "description": "\"\"\"\n\n_HOMEPAGE = \"\"\n\n_LICENSE = \"\"\n\n_BASE_URL = \"https://huggingface.co/datasets/it5/datasets/resolve/main/data/{config}_{split}.json.gz\"\n\n# Formality Style Transfer with XFormal\n_FST_SPLITS = [\"train\", \"valid\", \"test_0\", \"test_1\", \"test_2\", \"test_3\"]\n\n# Headline Generation with CHANGE-it\n_HG_SPLITS = [\"train\", \"valid\", \"test\"]\n\n# News Summarization with Fanpage/IlPost\n_NS_SPLITS = [\"train\", \"valid\", \"test_fanpage\", \"test_ilpost\"]\n\n# Question Answering with SQUAD-it\n_QA_SPLITS = [\"train\", \"valid\", \"test\"]\n\n# Question Generation with SQUAD-it\n_QG_SPLITS = [\"train\", \"valid\", \"test\"]\n\n# Headline Style Transfer Giornale -> Repubblica with CHANGE-it\n_ST_G2R_SPLITS = [\"train\", \"valid\", \"test\"]\n\n# Headline Style Transfer Repubblica -> Giornale with CHANGE-it\n_ST_R2G_SPLITS = [\"train\", \"valid\", \"test\"]\n\n# Wikipedia Summarization with WITS\n_WITS_SPLITS = [\"train\", \"valid\", \"test\"]\n\n_CONFIG_SPLITS = {\n \"fst\": _FST_SPLITS,\n \"hg\": _HG_SPLITS,\n \"ns\": _NS_SPLITS,\n \"qa\": _QA_SPLITS,\n \"qg\": _QG_SPLITS,\n \"st_g2r\": _ST_G2R_SPLITS,\n \"st_r2g\": _ST_R2G_SPLITS,\n \"wits\": _WITS_SPLITS,\n}\n\n_CONFIG_FEATS = {\n \"fst\": [\"formal\", \"informal\"],\n \"hg\": [\"text\", \"target\"],\n \"ns\": [\"source\", \"target\"],\n \"qa\": [\"source\", \"target\"],\n \"qg\": [\"text\", \"target\"],\n \"st_g2r\": [\"headline\", \"full_text\"],\n \"st_r2g\": [\"headline\", \"full_text\"],\n \"wits\": [\"summary\", \"source\"]\n}\n\nclass IT5ExperimentsConfig(datasets.BuilderConfig):\n\n def __init__(self, features, **kwargs):", "evaluation_metadata": {}}, "jegormeister/dutch-snli": {"name": "jegormeister/dutch-snli", "description": "This is the Dutch version of the original SNLI dataset. The translation was performed using Google Translate. Original SNLI available at https://nlp.stanford.edu/projects/snli/", "evaluation_metadata": {}}, "jfrenz/legalglue": {"name": "jfrenz/legalglue", "description": "\\\r\nLegal General Language Understanding Evaluation (LegalGLUE) benchmark is\r\na collection of datasets for evaluating model performance across a diverse set of legal NLP tasks", "evaluation_metadata": {}}, "jglaser/binding_affinity": {"name": "jglaser/binding_affinity", "description": "A dataset to fine-tune language models on protein-ligand binding affinity prediction.", "evaluation_metadata": {}}, "jimregan/foinse": {"name": "jimregan/foinse", "description": "Foinse was an Irish-language magazine site.\nThis script uses a list of articles retrieved from the\nWayback Machine to build a corpus", "evaluation_metadata": {}}, "jimregan/lasid": {"name": "jimregan/lasid", "description": "Linguistic Atlas and Survey of Irish Dialects, volume 1", "evaluation_metadata": {}}, "jinmang2/KorQuADv1": {"name": "jinmang2/KorQuADv1", "description": "KorQuAD 1.0 (Korean Question Answering Dataset v1.0)\nKorQuAD 1.0 is a dataset created for Korean Machine Reading Comprehension.\nThe answers to all your questions are made up of some subareas in the corresponding Wikipedia article paragraphs.\nIt is structured in the same way as the Stanford Question Answering Dataset (SQuAD) v1.0.", "evaluation_metadata": {}}, "joelito/ler": {"name": "joelito/ler", "description": "We describe a dataset developed for Named Entity Recognition in German federal court decisions. \nIt consists of approx. 67,000 sentences with over 2 million tokens. \nThe resource contains 54,000 manually annotated entities, mapped to 19 fine-grained semantic classes: \nperson, judge, lawyer, country, city, street, landscape, organization, company, institution, court, brand, law, \nordinance, European legal norm, regulation, contract, court decision, and legal literature. \nThe legal documents were, furthermore, automatically annotated with more than 35,000 TimeML-based time expressions. \nThe dataset, which is available under a CC-BY 4.0 license in the CoNNL-2002 format, \nwas developed for training an NER service for German legal documents in the EU project Lynx.", "evaluation_metadata": {}}, "joelito/sem_eval_2010_task_8": {"name": "joelito/sem_eval_2010_task_8", "description": "The SemEval-2010 Task 8 focuses on Multi-way classification of semantic relations between pairs of nominals.\nThe task was designed to compare different approaches to semantic relation classification\nand to provide a standard testbed for future research.", "evaluation_metadata": {}}, "ju-bezdek/conll2003-SK-NER": {"name": "ju-bezdek/conll2003-SK-NER", "description": "This is translated version of the original CONLL2003 dataset (translated from English to Slovak via Google translate) Annotation was done mostly automatically with word matching scripts. Records where some tags were not matched, were annotated manually (10%) Unlike the original Conll2003 dataset, this one contains only NER tags", "evaluation_metadata": {}}, "juny116/few_glue": {"name": "juny116/few_glue", "description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.", "evaluation_metadata": {}}, "k-halid/ar": {"name": "k-halid/ar", "description": "The corpus is a part of the MultiUN corpus.It is a collection of translated documents from the United Nations.The corpus is download from the following website : [open parallel corpus](http://opus.datasetsl.eu/) \\", "evaluation_metadata": {}}, "keshan/clean-si-mc4": {"name": "keshan/clean-si-mc4", "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI.", "evaluation_metadata": {}}, "keshan/multispeaker-tts-sinhala": {"name": "keshan/multispeaker-tts-sinhala", "description": "\\\\nThis data set contains multi-speaker high quality transcribed audio data for Sinhala. The data set consists of wave files, and a TSV file. \nThe file si_lk.lines.txt contains a FileID, which in tern contains the UserID and the Transcription of audio in the file.\nThe data set has been manually quality checked, but there might still be errors.\n\nPart of this dataset was collected by Google in Sri Lanka and the rest was contributed by Path to Nirvana organization.", "evaluation_metadata": {}}, "kiamehr74/CoarseWSD-20": {"name": "kiamehr74/CoarseWSD-20", "description": "The CoarseWSD-20 dataset is a coarse-grained sense disambiguation built from Wikipedia \n(nouns only) targetting 2 to 5 senses of 20 ambiguous words. It was specifically designed \nto provide an ideal setting for evaluating WSD models (e.g. no senses in test sets missing \nfrom training), both quantitavely and qualitatively.", "evaluation_metadata": {}}, "kleinay/qa_srl": {"name": "kleinay/qa_srl", "description": "The dataset contains question-answer pairs to model verbal predicate-argument structure. \nThe questions start with wh-words (Who, What, Where, What, etc.) and contain a verb predicate in the sentence; the answers are phrases in the sentence.\nThis dataset loads the train split from \"QASRL Bank\", a.k.a \"QASRL-v2\" or \"QASRL-LS\" (Large Scale), \nwhich was constructed via crowdsourcing and presented at (FitzGeralds et. al., ACL 2018), \nand the dev and test splits from QASRL-GS (Gold Standard), introduced in (Roit et. al., ACL 2020).", "evaluation_metadata": {}}, "kresnik/librispeech_asr_test": {"name": "kresnik/librispeech_asr_test", "description": "\\\r\nLibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\r\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\r\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.\r\nNote that in order to limit the required storage for preparing this dataset, the audio\r\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\r\nfile to a float32 array, please make use of the `.map()` function as follows:\r\n```python\r\nimport soundfile as sf\r\ndef map_to_array(batch):\r\n speech_array, _ = sf.read(batch[\"file\"])\r\n batch[\"speech\"] = speech_array\r\n return batch\r\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\r\n```", "evaluation_metadata": {}}, "lavis-nlp/german_legal_sentences": {"name": "lavis-nlp/german_legal_sentences", "description": "German Legal Sentences (GLS) is an automatically generated training dataset for semantic sentence \nmatching in the domain in german legal documents. It follows the concept of weak supervision, where \nimperfect labels are generated using multiple heuristics. For this purpose we use a combination of \nlegal citation matching and BM25 similarity. The contained sentences and their citations are parsed \nfrom real judicial decisions provided by [Open Legal Data](http://openlegaldata.io/)", "evaluation_metadata": {}}, "leonadase/mycoll3": {"name": "leonadase/mycoll3", "description": "\\\r\nThe shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\r\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\r\nnot belong to the previous three groups.\r\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\r\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\r\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\r\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\r\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\r\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\r\ntagging scheme, whereas the original dataset uses IOB1.\r\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "lewtun/mnist-preds": {"name": "lewtun/mnist-preds", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "lhoestq/custom_squad": {"name": "lhoestq/custom_squad", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "lhoestq/squad": {"name": "lhoestq/squad", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "lhoestq/test": {"name": "lhoestq/test", "description": "This is a test dataset.", "evaluation_metadata": {}}, "lhoestq/wikipedia_bn": {"name": "lhoestq/wikipedia_bn", "description": "Bengali Wikipedia from the dump of 03/20/2021.\nThe data was processed using the huggingface datasets wikipedia script early april 2021.\nThe dataset was built from the Wikipedia dump (https://dumps.wikimedia.org/).\nEach example contains the content of one full Wikipedia article with cleaning to strip\nmarkdown and unwanted sections (references, etc.).", "evaluation_metadata": {}}, "liweili/c4_200m": {"name": "liweili/c4_200m", "description": "\\\r\nGEC Dataset Generated from C4", "evaluation_metadata": {}}, "codeparrot/github-code": {"name": "codeparrot/github-code", "description": "The GitHub Code dataest consists of 115M code files from GitHub in 32 programming languages with 60 extensions totalling in 1TB of text data. The dataset was created from the GitHub dataset on BiqQuery.", "evaluation_metadata": {}}, "m3hrdadfi/recipe_nlg_lite": {"name": "m3hrdadfi/recipe_nlg_lite", "description": "RecipeNLG: A Cooking Recipes Dataset for Semi-Structured Text Generation - Lite version\nThe dataset we publish contains 7,198 cooking recipes (>7K). \nIt's processed in more careful way and provides more samples than any other dataset in the area.", "evaluation_metadata": {}}, "masked-neuron/ccd": {"name": "masked-neuron/ccd", "description": "The consumer compaint data set is derived from the consumer complaint database\nfor the purpose of benchmarking quantification / label shift algorithms. The\ndata set consists of records of compaints about consumer financial products and\nservices that the Consumer Financial Protection Bureau sent to companies for \nresponse. Each record has a corresponding product / sub product field which can\nbe used as labels for text classification.", "evaluation_metadata": {}}, "maximedb/mcqa_light": {"name": "maximedb/mcqa_light", "description": "MQA is a multilingual corpus of questions and answers parsed from the Common Crawl. Questions are divided between Frequently Asked Questions (FAQ) pages and Community Question Answering (CQA) pages.", "evaluation_metadata": {}}, "maximedb/mfaq_light": {"name": "maximedb/mfaq_light", "description": "MQA is a multilingual corpus of questions and answers parsed from the Common Crawl. Questions are divided between Frequently Asked Questions (FAQ) pages and Community Question Answering (CQA) pages.", "evaluation_metadata": {}}, "maximedb/paws-x-all": {"name": "maximedb/paws-x-all", "description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.", "evaluation_metadata": {}}, "maximedb/wow": {"name": "maximedb/wow", "description": "In open-domain dialogue intelligent agents should exhibit the use of knowledge, however there are few convincing demonstrations of this to date. The most popular sequence to sequence models typically \"generate and hope\" generic utterances that can be memorized in the weights of the model when mapping from input utterance(s) to output, rather than employing recalled knowledge as context. Use of knowledge has so far proved difficult, in part because of the lack of a supervised learning benchmark task which exhibits knowledgeable open dialogue with clear grounding. To that end we collect and release a large dataset with conversations directly grounded with knowledge retrieved from Wikipedia. We then design architectures capable of retrieving knowledge, reading and conditioning on it, and finally generating natural responses. Our best performing dialogue models are able to conduct knowledgeable discussions on open-domain topics as evaluated by automatic metrics and human evaluations, while our new benchmark allows for measuring further improvements in this important research direction.", "evaluation_metadata": {}}, "metaeval/blimp_classification": {"name": "metaeval/blimp_classification", "description": "Acceptable/non acceptable sentences (recasted as a classification task)", "evaluation_metadata": {}}, "tasksource/crowdflower": {"name": "tasksource/crowdflower", "description": "Collection of crowdflower classification datasets", "evaluation_metadata": {}}, "metaeval/linguisticprobing": {"name": "metaeval/linguisticprobing", "description": "10 probing tasks designed to capture simple linguistic features of sentences,", "evaluation_metadata": {}}, "metaeval/recast": {"name": "metaeval/recast", "description": "A diverse collection of tasks recasted as natural language inference tasks.", "evaluation_metadata": {}}, "midas/inspec": {"name": "midas/inspec", "description": "Benchmark dataset for automatic identification of keyphrases from text published with the work - Improved automatic keyword extraction given more linguistic knowledge. Anette Hulth. In Proceedings of EMNLP 2003. p. 216-223.", "evaluation_metadata": {}}, "midas/ldkp10k": {"name": "midas/ldkp10k", "description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "midas/ldkp3k": {"name": "midas/ldkp3k", "description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "midas/test_ldkp": {"name": "midas/test_ldkp", "description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "mideind/icelandic-error-corpus-IceEC": {"name": "mideind/icelandic-error-corpus-IceEC", "description": "The Icelandic Error Corpus (IceEC) is a collection of texts in modern Icelandic annotated for mistakes related to spelling, grammar, and other issues. The texts are organized by genre. The current version includes sentences from student essays, online news texts and Wikipedia articles.\nSentences within texts in the student essays had to be shuffled due to the license which they were originally published under, but neither the online news texts nor the Wikipedia articles needed to be shuffled.", "evaluation_metadata": {}}, "ml6team/cnn_dailymail_nl": {"name": "ml6team/cnn_dailymail_nl", "description": " This dataset is the CNN/Dailymail dataset translated to Dutch.\n This is the original dataset:\n ```\n load_dataset(\"cnn_dailymail\", '3.0.0')\n ```\n And this is the HuggingFace translation pipeline:\n ```\n pipeline(\n task='translation_en_to_nl',\n model='Helsinki-NLP/opus-mt-en-nl',\n tokenizer='Helsinki-NLP/opus-mt-en-nl')\n ```", "evaluation_metadata": {}}, "mldmm/glass_alloy_composition": {"name": "mldmm/glass_alloy_composition", "description": "This is an alloy composition dataset", "evaluation_metadata": {}}, "mnemlaghi/widdd": {"name": "mnemlaghi/widdd", "description": "WiDDD stands for WIkiData Disambig with Descriptions. The former dataset comes from [Cetoli & al](https://arxiv.org/pdf/1810.09164.pdf) paper, and is aimed at solving Named Entity Disambiguation. This datasets tries to extract relevant information from entities descriptions only, instead of working with graphs. In order to do so, we mapped every Wikidata id (correct id and wrong id) in the original paper with its WikiData description. If not found, row is discarded for this version.", "evaluation_metadata": {}}, "indonesian-nlp/mc4-id": {"name": "indonesian-nlp/mc4-id", "description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.", "evaluation_metadata": {}}, "mvarma/medwiki": {"name": "mvarma/medwiki", "description": "MedWiki is a large-scale sentence dataset collected from Wikipedia with medical entity (UMLS) annotations. This dataset is intended for pretraining.", "evaluation_metadata": {}}, "ncoop57/csnc_human_judgement": {"name": "ncoop57/csnc_human_judgement", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "ncoop57/rico_captions": {"name": "ncoop57/rico_captions", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "nthngdy/ccnews_split": {"name": "nthngdy/ccnews_split", "description": "CC-News containing news articles from news sites all over the world The data is available on AWS S3 in the Common Crawl bucket at /crawl-data/CC-NEWS/. This version of the dataset has 708241 articles. It represents a small portion of English language subset of the CC-News dataset created using news-please(Hamborg et al.,2017) to collect and extract English language portion of CC-News.", "evaluation_metadata": {}}, "oscar-corpus/OSCAR-2109": {"name": "oscar-corpus/OSCAR-2109", "description": "The Open Super-large Crawled Aggregated coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.\\", "evaluation_metadata": {}}, "ought/raft": {"name": "ought/raft", "description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)", "evaluation_metadata": {}}, "pasinit/scotus": {"name": "pasinit/scotus", "description": "Dataset extracted from case laws of Supreme Court of United States.", "evaluation_metadata": {}}, "pasinit/xlwic": {"name": "pasinit/xlwic", "description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)", "evaluation_metadata": {}}, "patrickvonplaten/librispeech_asr_dummy": {"name": "patrickvonplaten/librispeech_asr_dummy", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```", "evaluation_metadata": {}}, "patrickvonplaten/scientific_papers_dummy": {"name": "patrickvonplaten/scientific_papers_dummy", "description": "Scientific papers datasets contains two sets of long and structured documents.\nThe datasets are obtained from ArXiv and PubMed OpenAccess repositories.\n\nBoth \"arxiv\" and \"pubmed\" have two features:\n - article: the body of the document, pagragraphs seperated by \"/n\".\n - abstract: the abstract of the document, pagragraphs seperated by \"/n\".\n - section_names: titles of sections, seperated by \"/n\".", "evaluation_metadata": {}}, "peixian/equity_evaluation_corpus": {"name": "peixian/equity_evaluation_corpus", "description": "Automatic machine learning systems can inadvertently accentuate and perpetuate inappropriate human biases. Past work on examining inappropriate biases has largely focused on just individual systems and resources. Further, there is a lack of benchmark datasets for examining inappropriate biases in system predictions. Here, we present the Equity Evaluation Corpus (EEC), which consists of 8,640 English sentences carefully chosen to tease out biases towards certain races and genders. We used the dataset to examine 219 automatic sentiment analysis systems that took part in a recent shared task, SemEval-2018 Task 1 \u2018Affect in Tweets\u2019. We found that several of the systems showed statistically significant bias; that is, they consistently provide slightly higher sentiment intensity predictions for one race or one gender. We make the EEC freely available, and encourage its use to evaluate biases in sentiment and other NLP tasks.", "evaluation_metadata": {}}, "persiannlp/parsinlu_entailment": {"name": "persiannlp/parsinlu_entailment", "description": "A Persian textual entailment task (deciding `sent1` entails `sent2`).", "evaluation_metadata": {}}, "persiannlp/parsinlu_query_paraphrasing": {"name": "persiannlp/parsinlu_query_paraphrasing", "description": "A Persian query paraphrasing task (paraphrase or not, given two questions). \nThe questions are partly mined using Google auto-complete, and partly translated from Quora paraphrasing dataset.", "evaluation_metadata": {}}, "persiannlp/parsinlu_reading_comprehension": {"name": "persiannlp/parsinlu_reading_comprehension", "description": "A Persian reading comprehenion task (generating an answer, given a question and a context paragraph). \nThe questions are mined using Google auto-complete, their answers and the corresponding evidence documents are manually annotated by native speakers.", "evaluation_metadata": {}}, "persiannlp/parsinlu_sentiment": {"name": "persiannlp/parsinlu_sentiment", "description": "A Persian sentiment analysis task (deciding whether a given sentence contains a particular sentiment).", "evaluation_metadata": {}}, "persiannlp/parsinlu_translation_en_fa": {"name": "persiannlp/parsinlu_translation_en_fa", "description": "A Persian translation dataset (English -> Persian).", "evaluation_metadata": {}}, "persiannlp/parsinlu_translation_fa_en": {"name": "persiannlp/parsinlu_translation_fa_en", "description": "A Persian translation dataset (Persian -> English).", "evaluation_metadata": {}}, "piEsposito/br_quad_20": {"name": "piEsposito/br_quad_20", "description": "Translates SQuAD 2.0 from english to portuguese using Google Cloud API", "evaluation_metadata": {}}, "piEsposito/squad_20_ptbr": {"name": "piEsposito/squad_20_ptbr", "description": "Translates SQuAD 2.0 from english to portuguese using Google Cloud API", "evaluation_metadata": {}}, "pile-of-law/pile-of-law": {"name": "pile-of-law/pile-of-law", "description": "We curate a large corpus of legal and administrative data. The utility of this data is twofold: (1) to aggregate legal and administrative data sources that demonstrate different norms and legal standards for data filtering; (2) to collect a dataset that can be used in the future for pretraining legal-domain language models, a key direction in access-to-justice initiatives.", "evaluation_metadata": {}}, "MLCommons/ml_spoken_words": {"name": "MLCommons/ml_spoken_words", "description": "Multilingual Spoken Words Corpus is a large and growing audio dataset of spoken\nwords in 50 languages collectively spoken by over 5 billion people, for academic\nresearch and commercial applications in keyword spotting and spoken term search,\nlicensed under CC-BY 4.0. The dataset contains more than 340,000 keywords,\ntotaling 23.4 million 1-second spoken examples (over 6,000 hours). The dataset\nhas many use cases, ranging from voice-enabled consumer devices to call center\nautomation. This dataset is generated by applying forced alignment on crowd-sourced sentence-level\naudio to produce per-word timing estimates for extraction.\nAll alignments are included in the dataset.", "evaluation_metadata": {}}, "projecte-aina/ancora-ca-ner": {"name": "projecte-aina/ancora-ca-ner", "description": "AnCora Catalan NER.\n This is a dataset for Named Eentity Reacognition (NER) from Ancora corpus adapted for \n Machine Learning and Language Model evaluation purposes.\n Since multiwords (including Named Entites) in the original Ancora corpus are aggregated as \n a single lexical item using underscores (e.g. \"Ajuntament_de_Barcelona\") \n we splitted them to align with word-per-line format, and added conventional Begin-Inside-Outside (IOB)\n tags to mark and classify Named Entites. \n We did not filter out the different categories of NEs from Ancora (weak and strong). \n We did 6 minor edits by hand.\n AnCora corpus is used under [CC-by] (https://creativecommons.org/licenses/by/4.0/) licence.\n This dataset was developed by BSC TeMU as part of the AINA project, and to enrich the Catalan Language Understanding Benchmark (CLUB).", "evaluation_metadata": {}}, "projecte-aina/casum": {"name": "projecte-aina/casum", "description": "CaSum is a summarization dataset. It is extracted from a newswire corpus crawled from the Catalan News Agency. The corpus consists of 217,735 instances that are composed by the headline and the body.", "evaluation_metadata": {}}, "projecte-aina/catalan_general_crawling": {"name": "projecte-aina/catalan_general_crawling", "description": "The Catalan General Crawling Corpus is a 435-million-token web corpus of Catalan built from the web. It has been obtained by crawling the 500 most popular .cat and .ad domains during July 2020. It consists of 434.817.705 tokens, 19.451.691 sentences and 1.016.114 documents. Documents are separated by single new lines. It is a subcorpus of the Catalan Textual Corpus.", "evaluation_metadata": {}}, "projecte-aina/catalan_government_crawling": {"name": "projecte-aina/catalan_government_crawling", "description": "The Catalan Government Crawling Corpus is a 39-million-token web corpus of Catalan built from the web. It has been obtained by crawling the .gencat domain and subdomains, belonging to the Catalan Government during September and October 2020. It consists of 39.117.909 tokens, 1.565.433 sentences and 71.043 documents. Documents are separated by single new lines. It is a subcorpus of the Catalan Textual Corpus.", "evaluation_metadata": {}}, "projecte-aina/sts-ca": {"name": "projecte-aina/sts-ca", "description": "Semantic Textual Similarity in Catalan.\n STS corpus is a benchmark for evaluating Semantic Text Similarity in Catalan.\n It consists of more than 3000 sentence pairs, annotated with the semantic similarity between them, \n using a scale from 0 (no similarity at all) to 5 (semantic equivalence). \n It is done manually by 4 different annotators following our guidelines based on previous work from the SemEval challenges (https://www.aclweb.org/anthology/S13-1004.pdf).\n The source data are scraped sentences from the Catalan Textual Corpus (https://doi.org/10.5281/zenodo.4519349), used under CC-by-SA-4.0 licence (https://creativecommons.org/licenses/by-sa/4.0/). The dataset is released under the same licence.\n This dataset was developed by BSC TeMU as part of the AINA project, and to enrich the Catalan Language Understanding Benchmark (CLUB).\n This is the version 1.0.2 of the dataset with the complete human and automatic annotations and the analysis scripts. It also has a more accurate license.\n This dataset can be used to build and score semantic similiarity models.", "evaluation_metadata": {}}, "projecte-aina/teca": {"name": "projecte-aina/teca", "description": "TECA consists of two subsets of textual entailment in Catalan, *catalan_TE1* and *vilaweb_TE*, which contain 14997 and 6166 pairs of premises and hypotheses, annotated according to the inference relation they have (implication, contradiction or neutral). This dataset was developed by BSC TeMU as part of the AINA project and intended as part of the Catalan Language Understanding Benchmark (CLUB).", "evaluation_metadata": {}}, "projecte-aina/tecla": {"name": "projecte-aina/tecla", "description": "TeCla: Text Classification Catalan dataset\n Catalan News corpus for Text classification, crawled from ACN (Catalan News Agency) site: www.acn.cat\n Corpus de not\u00edcies en catal\u00e0 per a classificaci\u00f3 textual, extret del web de l'Ag\u00e8ncia Catalana de Not\u00edcies - www.acn.cat", "evaluation_metadata": {}}, "projecte-aina/vilaquad": {"name": "projecte-aina/vilaquad", "description": "This dataset contains 2095 of Catalan language news articles along with 1 to 5 questions referring to each fragment (or context).\n\nVilaQuad articles are extracted from the daily Vilaweb (www.vilaweb.cat) and used under CC-by-nc-sa-nd (https://creativecommons.org/licenses/by-nc-nd/3.0/deed.ca) licence. \n\nThis dataset can be used to build extractive-QA and Language Models.\n\nFunded by the Generalitat de Catalunya, Departament de Pol\u00edtiques Digitals i Administraci\u00f3 P\u00fablica (AINA),\n\nMT4ALL and Plan de Impulso de las Tecnolog\u00edas del Lenguaje (Plan TL).", "evaluation_metadata": {}}, "projecte-aina/vilasum": {"name": "projecte-aina/vilasum", "description": "VilaSum is a summarization dataset for evaluation. It is extracted from a newswire corpus crawled from Vilaweb. The corpus consists of 13,843 instances that are composed by the headline and the body.", "evaluation_metadata": {}}, "projecte-aina/viquiquad": {"name": "projecte-aina/viquiquad", "description": "ViquiQuAD: an extractive QA dataset from Catalan Wikipedia.\nThis dataset contains 3111 contexts extracted from a set of 597 high quality original (no translations) \narticles in the Catalan Wikipedia \"Viquip\u00e8dia\" (ca.wikipedia.org), and 1 to 5 questions with their\nanswer for each fragment. Viquipedia articles are used under CC-by-sa licence. \nThis dataset can be used to build extractive-QA and Language Models.\nFunded by the Generalitat de Catalunya, Departament de Pol\u00edtiques Digitals i Administraci\u00f3 P\u00fablica (AINA),\nMT4ALL and Plan de Impulso de las Tecnolog\u00edas del Lenguaje (Plan TL).", "evaluation_metadata": {}}, "projecte-aina/wnli-ca": {"name": "projecte-aina/wnli-ca", "description": "professional translation into Catalan of Winograd NLI dataset as published in GLUE Benchmark.\n The Winograd NLI dataset presents 855 sentence pairs, \n in which the first sentence contains an ambiguity and the second one a possible interpretation of it. \n The label indicates if the interpretation is correct (1) or not (0).", "evaluation_metadata": {}}, "projecte-aina/xquad-ca": {"name": "projecte-aina/xquad-ca", "description": "Professional translation into Catalan of XQuAD dataset (https://github.com/deepmind/xquad).\n XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating \n cross-lingual question answering performance. \n The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from \n the development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with \n their professional translations into ten languages: \n Spanish, German, Greek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, and Hindi. \n Rumanian was added later.\n We added the 13th language to the corpus using also professional native catalan translators.\n XQuAD and XQuAD-Ca datasets are released under CC-by-sa licence.", "evaluation_metadata": {}}, "qanastek/WMT-16-PubMed": {"name": "qanastek/WMT-16-PubMed", "description": "WMT'16 Biomedical Translation Task - PubMed parallel datasets\nhttp://www.statmt.org/wmt16/biomedical-translation-task.html", "evaluation_metadata": {}}, "qwant/squad_fr": {"name": "qwant/squad_fr", "description": "SQuAD-fr is a French translated version of the Stanford Question Answering Dataset (SQuAD), the reference corpus to evaluate question answering models' performances in English.\nIt consists of 100K question-answer pairs on 500+ articles derived from the original English dataset and represents a large-scale dataset for closed-domain question answering on factoid questions in French.\nSQuAD-fr serves as a means of data augmentation on FQuAD and PIAF benchmarks, with 90K+ translated training pairs.", "evaluation_metadata": {}}, "rahular/itihasa": {"name": "rahular/itihasa", "description": "A Sanskrit-English machine translation dataset.", "evaluation_metadata": {}}, "ramybaly/conll2012": {"name": "ramybaly/conll2012", "description": "The CoNLL-2012 shared task involved predicting coreference in English, Chinese, and Arabic, using the final version, v5.0,\nof the OntoNotes corpus. It was a follow-on to the English-only task organized in 2011. Until the creation of the OntoNotes\ncorpus, resources in this sub-field of language processing were limited to noun phrase coreference, often on a restricted\nset of entities, such as the ACE entities. OntoNotes provides a large-scale corpus of general anaphoric coreference not\nrestricted to noun phrases or to a specified set of entity types, and covers multiple languages. OntoNotes also provides\nadditional layers of integrated annotation, capturing additional shallow semantic structure. This paper describes the\nOntoNotes annotation (coreference and other layers) and then describes the parameters of the shared task including the\nformat, pre-processing information, evaluation criteria, and presents and discusses the results achieved by the participating\nsystems. The task of coreference has had a complex evaluation history. Potentially many evaluation conditions, have, in the past,\nmade it difficult to judge the improvement in new algorithms over previously reported results. Having a standard test set\nand standard evaluation parameters, all based on a resource that provides multiple integrated annotation layers (syntactic\nparses, semantic roles, word senses, named entities and coreference) and in multiple languages could support joint modeling\nand help ground and energize ongoing research in the task of entity and event coreference.\nFor more details see https://aclanthology.org/W12-4501.pdf", "evaluation_metadata": {}}, "ramybaly/nerd": {"name": "ramybaly/nerd", "description": "Recently, considerable literature has grown up around the theme of few-shot named entity recognition (NER), but little published benchmark\ndata specifically focused on the practical and challenging task. Current approaches collect existing supervised NER datasets and reorganize\nthem into the few-shot setting for empirical study. These strategies conventionally aim to recognize coarse-grained entity types with few\nexamples, while in practice, most unseen entity types are fine-grained. In this paper, we present FEW-NERD, a large-scale human-annotated\nfew-shot NER dataset with a hierarchy of 8 coarse-grained and 66 fine-grained entity types. FEW-NERD consists of 188,238 sentences from\nWikipedia, 4,601,160 words are included and each is annotated as context or a part of a two-level entity type. To the best of our knowledge,\nthis is the first few-shot NER dataset and the largest human-crafted NER dataset. We construct benchmark tasks with different emphases to\ncomprehensively assess the generalization capability of models. Extensive empirical results and analysis show that FEW-NERD is challenging\nand the problem requires further research. We make Few-NERD public at https://nigding97.github.io/fewnerd/", "evaluation_metadata": {}}, "roskoN/dailydialog": {"name": "roskoN/dailydialog", "description": "The DailyDialog dataset as provided in the original form with a bit of preprocessing applied to enable dast prototyping.\nThe splits are as in the original distribution.", "evaluation_metadata": {}}, "roskoN/dstc8-reddit-corpus": {"name": "roskoN/dstc8-reddit-corpus", "description": "The DSTC8 dataset as provided in the original form.\nThe only difference is that the splits are in separate zip files.\nIn the orignal output it is one big archive containing all splits.", "evaluation_metadata": {}}, "sagnikrayc/mctest": {"name": "sagnikrayc/mctest", "description": "MCTest requires machines to answer multiple-choice reading comprehension questions about fictional stories, directly tackling the high-level goal of open-domain machine comprehension.", "evaluation_metadata": {}}, "sagnikrayc/quasar": {"name": "sagnikrayc/quasar", "description": "We present two new large-scale datasets aimed at evaluating systems designed to comprehend a natural language query and extract its answer from a large corpus of text. The Quasar-S dataset consists of 37000 cloze-style (fill-in-the-gap) queries constructed from definitions of software entity tags on the popular website Stack Overflow. The posts and comments on the website serve as the background corpus for answering the cloze questions. The Quasar-T dataset consists of 43000 open-domain trivia questions and their answers obtained from various internet sources. ClueWeb09 serves as the background corpus for extracting these answers. We pose these datasets as a challenge for two related subtasks of factoid Question Answering: (1) searching for relevant pieces of text that include the correct answer to a query, and (2) reading the retrieved text to answer the query.", "evaluation_metadata": {}}, "sagteam/author_profiling": {"name": "sagteam/author_profiling", "description": "he corpus for the author profiling analysis contains texts in Russian-language which labeled for 5 tasks:\n1) gender -- 13530 texts with the labels, who wrote this: text female or male;\n2) age -- 13530 texts with the labels, how old the person who wrote the text. This is a number from 12 to 80. In addition, for the classification task we added 5 age groups: 1-19; 20-29; 30-39; 40-49; 50+;\n3) age imitation -- 7574 texts, where crowdsource authors is asked to write three texts: \n a) in their natural manner, \n b) imitating the style of someone younger, \n c) imitating the style of someone older;\n4) gender imitation -- 5956 texts, where the crowdsource authors is asked to write texts: in their origin gender and pretending to be the opposite gender;\n5) style imitation -- 5956 texts, where crowdsource authors is asked to write a text on behalf of another person of your own gender, with a distortion of the authors usual style.", "evaluation_metadata": {}}, "sebastiaan/test-cefr": {"name": "sebastiaan/test-cefr", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "semeru/completeformer-masked": {"name": "semeru/completeformer-masked", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "severo/wit": {"name": "severo/wit", "description": "Wikipedia-based Image Text (WIT) Dataset is a large multimodal multilingual dataset. WIT is composed of a curated set\n of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia languages. Its\n size enables WIT to be used as a pretraining dataset for multimodal machine learning models.", "evaluation_metadata": {}}, "shivmoha/squad-unanswerable": {"name": "shivmoha/squad-unanswerable", "description": "combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers\n to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but\n also determine when no answer is supported by the paragraph and abstain from answering.", "evaluation_metadata": {}}, "sil-ai/bloom-lm": {"name": "sil-ai/bloom-lm", "description": "This version of the Bloom Library data is developed specifically for the language modeling task.\nIt includes data from 484 languages across 39 language families, with many of the languages represented\nbeing extremely low resourced languages.", "evaluation_metadata": {}}, "sshleifer/pseudo_bart_xsum": {"name": "sshleifer/pseudo_bart_xsum", "description": "Extreme Summarization (XSum) Dataset.\n\nThere are two features:\n - document: Input news article.\n - summary: One sentence summary of the article.", "evaluation_metadata": {}}, "stas/c4-en-10k": {"name": "stas/c4-en-10k", "description": "This is a small subset representing the first 10K records of the original C4 dataset, \"en\" subset - created for testing. The records were extracted after having been shuffled.\n\nThe full 1TB+ dataset is at https://huggingface.co/datasets/c4.", "evaluation_metadata": {}}, "stas/openwebtext-10k": {"name": "stas/openwebtext-10k", "description": "An open-source replication of the WebText dataset from OpenAI.\n\nThis is a small subset representing the first 10K records from the original dataset - created for testing.\n\nThe full 8M-record dataset is at https://huggingface.co/datasets/openwebtext", "evaluation_metadata": {}}, "stas/oscar-en-10k": {"name": "stas/oscar-en-10k", "description": "This is a small subset representing 10K records from the original OSCAR dataset, \"unshuffled_deduplicated_en\" subset - created for testing. The records were extracted after having been shuffled.\n\nThe full 1TB+ dataset is at https://huggingface.co/datasets/oscar.", "evaluation_metadata": {}}, "susumu2357/squad_v2_sv": {"name": "susumu2357/squad_v2_sv", "description": "SQuAD_v2_sv is a Swedish version of SQuAD2.0. Translation was done automatically by using Google Translate API but it is not so straightforward because;\n\n1. the span which determines the start and the end of the answer in the context may vary after translation,\n2. tne translated context may not contain the translated answer if we translate both independently.\n\nMore details on how to handle these will be provided in another blog post.", "evaluation_metadata": {}}, "svanhvit/iceErrorCorpus": {"name": "svanhvit/iceErrorCorpus", "description": "Icelandic GEC corpus. \n\nThe Icelandic Error Corpus (IceEC) is a collection of texts in modern Icelandic annotated for mistakes related to spelling, grammar, and other issues. The texts are organized by genre, if which there are three: student essays, online news texts and Icelandic Wikipedia articles. Each mistake is marked according to error type using an error code, of which there are 253. The corpus consists of 4,046 texts with 56,956 categorized error instances. The corpus is divided into a development corpus, which comprises 90% of the corpus, and a test corpus, which comprises the other 10% of the corpus.", "evaluation_metadata": {}}, "tanfiona/causenet_wiki": {"name": "tanfiona/causenet_wiki", "description": "Crawled Wikipedia Data from CIKM 2020 paper \n'CauseNet: Towards a Causality Graph Extracted from the Web.'", "evaluation_metadata": {}}, "tau/mrqa": {"name": "tau/mrqa", "description": "The MRQA 2019 Shared Task focuses on generalization in question answering.\nAn effective question answering system should do more than merely\ninterpolate from the training set to answer test examples drawn\nfrom the same distribution: it should also be able to extrapolate\nto out-of-distribution examples \u2014 a significantly harder challenge.\nThe dataset is a collection of 18 existing QA dataset (carefully selected\nsubset of them) and converted to the same format (SQuAD format). Among\nthese 18 datasets, six datasets were made available for training,\nsix datasets were made available for development, and the final six\nfor testing. The dataset is released as part of the MRQA 2019 Shared Task.", "evaluation_metadata": {}}, "tau/scrolls": {"name": "tau/scrolls", "description": "SCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/", "evaluation_metadata": {}}, "toloka/CrowdSpeech": {"name": "toloka/CrowdSpeech", "description": "CrowdSpeech is a publicly available large-scale dataset of crowdsourced audio transcriptions. It contains annotations for more than 50 hours of English speech transcriptions from more than 1,000 crowd workers.", "evaluation_metadata": {}}, "toloka/VoxDIY-RusNews": {"name": "toloka/VoxDIY-RusNews", "description": "VoxDIY: Benchmark Dataset for Russian Crowdsourced Audio Transcription.", "evaluation_metadata": {}}, "turingbench/TuringBench": {"name": "turingbench/TuringBench", "description": "This benchmark environment contains a dataset comprised of generated texts from pre-trained language models.\nWe also have two benchmark tasks - human vs. machine (i.e., binary classification) and authorship\nattribution (i.e., multi-class classification). These benchmark tasks and dataset are hosted on the\nTuringBench website with Leaderboards for each task.", "evaluation_metadata": {}}, "uit-nlp/vietnamese_students_feedback": {"name": "uit-nlp/vietnamese_students_feedback", "description": "Students\u2019 feedback is a vital resource for the interdisciplinary research involving the combining of two different\nresearch fields between sentiment analysis and education.\n\nVietnamese Students\u2019 Feedback Corpus (UIT-VSFC) is the resource consists of over 16,000 sentences which are\nhuman-annotated with two different tasks: sentiment-based and topic-based classifications.\n\nTo assess the quality of our corpus, we measure the annotator agreements and classification evaluation on the\nUIT-VSFC corpus. As a result, we obtained the inter-annotator agreement of sentiments and topics with more than over\n91% and 71% respectively. In addition, we built the baseline model with the Maximum Entropy classifier and achieved\napproximately 88% of the sentiment F1-score and over 84% of the topic F1-score.", "evaluation_metadata": {}}, "usc-isi/WikiConvert": {"name": "usc-isi/WikiConvert", "description": "Language Modelling with Cardinal Number Annotations.", "evaluation_metadata": {}}, "uva-irlab/canard_quretec": {"name": "uva-irlab/canard_quretec", "description": "CANARD has been preprocessed by Voskarides et al. to train and evaluate their Query Resolution Term Classification\nmodel (QuReTeC).\n\nCANARD is a dataset for question-in-context rewriting that consists of questions each given in a dialog context\ntogether with a context-independent rewriting of the question. The context of each question is the dialog utterences\nthat precede the question. CANARD can be used to evaluate question rewriting models that handle important linguistic\nphenomena such as coreference and ellipsis resolution.", "evaluation_metadata": {}}, "uva-irlab/trec-cast-2019-multi-turn": {"name": "uva-irlab/trec-cast-2019-multi-turn", "description": "The Conversational Assistance Track (CAsT) is a new track for TREC 2019 to facilitate Conversational Information \nSeeking (CIS) research and to create a large-scale reusable test collection for conversational search systems. \nThe document corpus is 38,426,252 passages from the TREC Complex Answer Retrieval (CAR) and Microsoft MAchine \nReading COmprehension (MARCO) datasets.", "evaluation_metadata": {}}, "vblagoje/wikipedia_snippets_streamed": {"name": "vblagoje/wikipedia_snippets_streamed", "description": "The dataset was built from the Wikipedia dump (https://dumps.wikimedia.org/).\nEach example contains the content of one full Wikipedia article with cleaning to strip\nmarkdown and unwanted sections (references, etc.).", "evaluation_metadata": {}}, "w11wo/imdb-javanese": {"name": "w11wo/imdb-javanese", "description": "Large Movie Review Dataset translated to Javanese.\r\nThis is a dataset for binary sentiment classification containing substantially\r\nmore data than previous benchmark datasets. We provide a set of 25,000 highly\r\npolar movie reviews for training, and 25,000 for testing. There is additional\r\nunlabeled data for use as well. We translated the original IMDB Dataset to\r\nJavanese using the multi-lingual MarianMT Transformer model from\r\n`Helsinki-NLP/opus-mt-en-mul`.", "evaluation_metadata": {}}, "wardenga/lsoie": {"name": "wardenga/lsoie", "description": "The Large Scale Open Information Extraction Dataset (LSOIE), is a dataset 20 \ntimes larger than the next largest human-annotated Open Information Extraction\n(OIE) dataset. LSOIE is a built upon the QA-SRL 2.0 dataset.", "evaluation_metadata": {}}, "webis/args_me": {"name": "webis/args_me", "description": "The args.me corpus (version 1.0, cleaned) comprises 382 545 arguments crawled from four debate portals in the middle of 2019. The debate portals are Debatewise, IDebate.org, Debatepedia, and Debate.org. The arguments are extracted using heuristics that are designed for each debate portal.", "evaluation_metadata": {}}, "webis/conclugen": {"name": "webis/conclugen", "description": "The ConcluGen corpus is constructed for the task of argument summarization. It consists of 136,996 pairs of argumentative texts and their conclusions collected from the ChangeMyView subreddit, a web portal for argumentative discussions on controversial topics.\n\nThe corpus has three variants: aspects, topics, and targets. Each variation encodes the corresponding information via control codes. These provide additional argumentative knowledge for generating more informative conclusions.", "evaluation_metadata": {}}, "wietsedv/stsbenchmark": {"name": "wietsedv/stsbenchmark", "description": "STS Benchmark comprises a selection of the English datasets used in the STS tasks organized in the context of SemEval between 2012 and 2017. The selection of datasets include text from image captions, news headlines and user forums.", "evaluation_metadata": {}}, "yhavinga/mc4_nl_cleaned": {"name": "yhavinga/mc4_nl_cleaned", "description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.", "evaluation_metadata": {}}, "yuanchuan/annotated_reference_strings": {"name": "yuanchuan/annotated_reference_strings", "description": "A repository of reference strings annotated using CSL processor using citations obtained from various sources.", "evaluation_metadata": {}}, "zhufy/xquad_split": {"name": "zhufy/xquad_split", "description": "XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating cross-lingual question answering\nperformance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set\nof SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations into ten languages: Spanish, German,\nGreek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, Hindi and Romanian. Consequently, the dataset is entirely parallel\nacross 12 languages.", "evaluation_metadata": {}}, "nlpaueb/finer-139": {"name": "nlpaueb/finer-139", "description": "FiNER-139 is a named entity recognition dataset consisting of 10K annual \nand quarterly English reports (filings) of publicly traded companies \ndownloaded from the U.S. Securities and Exchange Commission (SEC) \nannotated with 139 XBRL tags in the IOB2 format.", "evaluation_metadata": {}}, "Alvenir/alvenir_asr_da_eval": {"name": "Alvenir/alvenir_asr_da_eval", "description": "Dataset of a little bit more than 5hours primarily intended as an evaluation dataset for Danish.", "evaluation_metadata": {}}, "google/xtreme_s": {"name": "google/xtreme_s", "description": "XTREME-S covers four task families: speech recognition, classification, speech-to-text translation and retrieval. Covering 102\nlanguages from 10+ language families, 3 different domains and 4\ntask families, XTREME-S aims to simplify multilingual speech\nrepresentation evaluation, as well as catalyze research in \u201cuniversal\u201d speech representation learning.", "evaluation_metadata": {}}, "elkarhizketak": {"name": "elkarhizketak", "description": "ElkarHizketak is a low resource conversational Question Answering\n(QA) dataset in Basque created by Basque speaker volunteers. The\ndataset contains close to 400 dialogues and more than 1600 question\nand answers, and its small size presents a realistic low-resource\nscenario for conversational QA systems. The dataset is built on top of\nWikipedia sections about popular people and organizations. The\ndialogues involve two crowd workers: (1) a student ask questions after\nreading a small introduction about the person, but without seeing the\nsection text; and (2) a teacher answers the questions selecting a span\nof text of the section.", "evaluation_metadata": {}}, "ruanchaves/hashset_distant_sampled": {"name": "ruanchaves/hashset_distant_sampled", "description": "Hashset is a new dataset consisiting on 1.9k manually annotated and 3.3M loosely supervised tweets for testing the \r\nefficiency of hashtag segmentation models. We compare State of The Art Hashtag Segmentation models on Hashset and other \r\nbaseline datasets (STAN and BOUN). We compare and analyse the results across the datasets to argue that HashSet can act \r\nas a good benchmark for hashtag segmentation tasks.\r\n\r\nHashSet Distant: 3.3M loosely collected camel cased hashtags containing hashtag and their segmentation.\r\n\r\nHashSet Distant Sampled is a sample of 20,000 camel cased hashtags from the HashSet Distant dataset.", "evaluation_metadata": {}}, "ruanchaves/hashset_distant": {"name": "ruanchaves/hashset_distant", "description": "Hashset is a new dataset consisiting on 1.9k manually annotated and 3.3M loosely supervised tweets for testing the \r\nefficiency of hashtag segmentation models. We compare State of The Art Hashtag Segmentation models on Hashset and other \r\nbaseline datasets (STAN and BOUN). We compare and analyse the results across the datasets to argue that HashSet can act \r\nas a good benchmark for hashtag segmentation tasks.\r\n\r\nHashSet Distant: 3.3M loosely collected camel cased hashtags containing hashtag and their segmentation.", "evaluation_metadata": {}}, "ruanchaves/stan_large": {"name": "ruanchaves/stan_large", "description": "The description below was taken from the paper \"Multi-task Pairwise Neural Ranking for Hashtag Segmentation\"\r\nby Maddela et al..\r\n\r\n\"STAN large, our new expert curated dataset, which includes all 12,594 unique English hashtags and their \r\nassociated tweets from the same Stanford dataset.\r\n\r\nSTAN small is the most commonly used dataset in previous work. However, after reexamination, we found annotation \r\nerrors in 6.8% of the hashtags in this dataset, which is significant given that the error rate of the state-of-the art \r\nmodels is only around 10%. Most of the errors were related to named entities. For example, #lionhead, \r\nwhich refers to the \u201cLionhead\u201d video game company, was labeled as \u201clion head\u201d.\r\n\r\nWe therefore constructed the STAN large dataset of 12,594 hashtags with additional quality control for human annotations.\"", "evaluation_metadata": {}}, "ruanchaves/stan_small": {"name": "ruanchaves/stan_small", "description": "Manually Annotated Stanford Sentiment Analysis Dataset by Bansal et al..", "evaluation_metadata": {}}, "ruanchaves/boun": {"name": "ruanchaves/boun", "description": "Dev-BOUN Development set that includes 500 manually segmented hashtags. These are selected from tweets about movies, \r\ntv shows, popular people, sports teams etc. Test-BOUN Test set that includes 500 manually segmented hashtags. \r\nThese are selected from tweets about movies, tv shows, popular people, sports teams etc.", "evaluation_metadata": {}}, "ruanchaves/dev_stanford": {"name": "ruanchaves/dev_stanford", "description": "1000 hashtags manually segmented by \u00c7elebi et al. for development purposes, \r\nrandomly selected from the Stanford Sentiment Tweet Corpus by Sentiment140.", "evaluation_metadata": {}}, "ruanchaves/test_stanford": {"name": "ruanchaves/test_stanford", "description": "Manually Annotated Stanford Sentiment Analysis Dataset by Bansal et al..", "evaluation_metadata": {}}, "ruanchaves/nru_hse": {"name": "ruanchaves/nru_hse", "description": "2000 real hashtags collected from several pages about civil services on vk.com (a Russian social network) \nand then segmented manually.", "evaluation_metadata": {}}, "ruanchaves/loyola": {"name": "ruanchaves/loyola", "description": "In programming languages, identifiers are tokens (also called symbols) which name language entities.\r\nSome of the kinds of entities an identifier might denote include variables, types, labels, subroutines, and packages.\r\n\r\nThe Loyola University of Delaware Identifier Splitting Oracle is a dataset for identifier segmentation, \r\ni.e. the task of adding spaces between the words on a identifier.", "evaluation_metadata": {}}, "mbartolo/synQA": {"name": "mbartolo/synQA", "description": "SynQA is a Reading Comprehension dataset created in the work \"Improving Question Answering Model Robustness with Synthetic Adversarial Data Generation\" (https://aclanthology.org/2021.emnlp-main.696/).\nIt consists of 314,811 synthetically generated questions on the passages in the SQuAD v1.1 (https://arxiv.org/abs/1606.05250) training set.\n\nIn this work, we use a synthetic adversarial data generation to make QA models more robust to human adversaries. We develop a data generation pipeline that selects source passages, identifies candidate answers, generates questions, then finally filters or re-labels them to improve quality. Using this approach, we amplify a smaller human-written adversarial dataset to a much larger set of synthetic question-answer pairs. By incorporating our synthetic data, we improve the state-of-the-art on the AdversarialQA (https://adversarialqa.github.io/) dataset by 3.7F1 and improve model generalisation on nine of the twelve MRQA datasets. We further conduct a novel human-in-the-loop evaluation to show that our models are considerably more robust to new human-written adversarial examples: crowdworkers can fool our model only 8.8% of the time on average, compared to 17.6% for a model trained without synthetic data.\n\nFor full details on how the dataset was created, kindly refer to the paper.", "evaluation_metadata": {}}, "ruanchaves/bt11": {"name": "ruanchaves/bt11", "description": "In programming languages, identifiers are tokens (also called symbols) which name language entities.\r\nSome of the kinds of entities an identifier might denote include variables, types, labels, subroutines, and packages.\r\n\r\nBT11 is a dataset for identifier segmentation, \r\ni.e. the task of adding spaces between the words on a identifier.", "evaluation_metadata": {}}, "ruanchaves/binkley": {"name": "ruanchaves/binkley", "description": "In programming languages, identifiers are tokens (also called symbols) which name language entities.\r\nSome of the kinds of entities an identifier might denote include variables, types, labels, subroutines, and packages.\r\n\r\nBinkley is a dataset for identifier segmentation, \r\ni.e. the task of adding spaces between the words on a identifier.", "evaluation_metadata": {}}, "ruanchaves/jhotdraw": {"name": "ruanchaves/jhotdraw", "description": "In programming languages, identifiers are tokens (also called symbols) which name language entities.\r\nSome of the kinds of entities an identifier might denote include variables, types, labels, subroutines, and packages.\r\n\r\nJhotdraw is a dataset for identifier segmentation, \r\ni.e. the task of adding spaces between the words on a identifier.", "evaluation_metadata": {}}, "ruanchaves/lynx": {"name": "ruanchaves/lynx", "description": "In programming languages, identifiers are tokens (also called symbols) which name language entities.\r\nSome of the kinds of entities an identifier might denote include variables, types, labels, subroutines, and packages.\r\n\r\nLynx is a dataset for identifier segmentation, \r\ni.e. the task of adding spaces between the words on a identifier.", "evaluation_metadata": {}}, "ruanchaves/snap": {"name": "ruanchaves/snap", "description": "Automatically segmented 803K SNAP Twitter Data Set hashtags with the heuristic described in the paper \"Segmenting hashtags using automatically created training data\".", "evaluation_metadata": {}}, "SocialGrep/the-antiwork-subreddit-dataset": {"name": "SocialGrep/the-antiwork-subreddit-dataset", "description": "This dataset follows the notorious subreddit /r/Antiwork, a place for many Redditors to share resources and discuss grievances with the current labour market.", "evaluation_metadata": {}}, "fmplaza/EmoEvent": {"name": "fmplaza/EmoEvent", "description": "EmoEvent is a multilingual emotion dataset of tweets based on different events that took place in April 2019. \nThree annotators labeled the tweets following the six Ekman\u2019s basic emotion model (anger, fear, sadness, joy, disgust, surprise) plus the \u201cneutral or other emotions\u201d category.", "evaluation_metadata": {}}, "ai4bharat/IndicParaphrase": {"name": "ai4bharat/IndicParaphrase", "description": "This is the paraphrasing dataset released as part of IndicNLG Suite. Each \ninput is paired with up to 5 references. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 5.57M.", "evaluation_metadata": {}}, "nthngdy/oscar-mini": {"name": "nthngdy/oscar-mini", "description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.\\", "evaluation_metadata": {}}, "drAbreu/bc4chemd_ner": {"name": "drAbreu/bc4chemd_ner", "description": "The automatic extraction of chemical information from text requires the recognition of chemical entity mentions as one of its key steps. When developing supervised named entity recognition (NER) systems, the availability of a large, manually annotated text corpus is desirable. Furthermore, large corpora permit the robust evaluation and comparison of different approaches that detect chemicals in documents. We present the CHEMDNER corpus, a collection of 10,000 PubMed abstracts that contain a total of 84,355 chemical entity mentions labeled manually by expert chemistry literature curators, following annotation guidelines specifically defined for this task. The abstracts of the CHEMDNER corpus were selected to be representative for all major chemical disciplines. Each of the chemical entity mentions was manually labeled according to its structure-associated chemical entity mention (SACEM) class: abbreviation, family, formula, identifier, multiple, systematic and trivial. The difficulty and consistency of tagging chemicals in text was measured using an agreement study between annotators, obtaining a percentage agreement of 91. For a subset of the CHEMDNER corpus (the test set of 3,000 abstracts) we provide not only the Gold Standard manual annotations, but also mentions automatically detected by the 26 teams that participated in the BioCreative IV CHEMDNER chemical mention recognition task. In addition, we release the CHEMDNER silver standard corpus of automatically extracted mentions from 17,000 randomly selected PubMed abstracts. A version of the CHEMDNER corpus in the BioC format has been generated as well. We propose a standard for required minimum information about entity annotations for the construction of domain specific corpora on chemical and drug entities. The CHEMDNER corpus and annotation guidelines are available at: http://www.biocreative.org/resources/biocreative-iv/chemdner-corpus/", "evaluation_metadata": {}}, "Non-Residual-Prompting/C2Gen": {"name": "Non-Residual-Prompting/C2Gen", "description": "The task of C2Gen is to both generate commonsensical text which include the given words, and also have the generated text adhere to the given context.", "evaluation_metadata": {}}, "CLUTRR/v1": {"name": "CLUTRR/v1", "description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.", "evaluation_metadata": {}}, "PaddlePaddle/dureader_robust": {"name": "PaddlePaddle/dureader_robust", "description": "DureaderRobust is a chinese reading comprehension dataset, designed to evaluate the MRC models from three aspects: over-sensitivity, over-stability and generalization.", "evaluation_metadata": {}}, "ai4bharat/IndicHeadlineGeneration": {"name": "ai4bharat/IndicHeadlineGeneration", "description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.", "evaluation_metadata": {}}, "ai4bharat/IndicSentenceSummarization": {"name": "ai4bharat/IndicSentenceSummarization", "description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.", "evaluation_metadata": {}}, "ai4bharat/IndicWikiBio": {"name": "ai4bharat/IndicWikiBio", "description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.", "evaluation_metadata": {}}, "ai4bharat/IndicQuestionGeneration": {"name": "ai4bharat/IndicQuestionGeneration", "description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.", "evaluation_metadata": {}}, "Mulin/sg-holiday": {"name": "Mulin/sg-holiday", "description": "This news dataset is a holiday information of singapore from 2017 to 2022.", "evaluation_metadata": {}}, "ruanchaves/reddit_china": {"name": "ruanchaves/reddit_china", "description": "Reddit comments with the word 'China' between 2010 and 2022.", "evaluation_metadata": {}}, "McGill-NLP/feedbackQA": {"name": "McGill-NLP/feedbackQA", "description": "FeedbackQA is a retrieval-based QA dataset that contains interactive feedback from users. It has two parts: the first part contains a conventional RQA dataset, whilst this repo contains the second part, which contains feedback(ratings and natural language explanations) for QA pairs.", "evaluation_metadata": {}}, "gigant/horse2zebra": {"name": "gigant/horse2zebra", "description": "Two unpaired sets of photos of respectively horses and zebras, designed for unpaired image-to-image translation, as seen in the paper introducing CycleGAN", "evaluation_metadata": {}}, "jglaser/protein_ligand_contacts": {"name": "jglaser/protein_ligand_contacts", "description": "A dataset to fine-tune language models on protein-ligand binding affinity and contact prediction.", "evaluation_metadata": {}}, "lewtun/top_quark_tagging": {"name": "lewtun/top_quark_tagging", "description": "Top Quark Tagging is a dataset of Monte Carlo simulated hadronic top and QCD dijet events for the evaluation of top quark tagging architectures. The dataset consists of 1.2M training events, 400k validation events and 400k test events.", "evaluation_metadata": {}}, "wikitablequestions": {"name": "wikitablequestions", "description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.", "evaluation_metadata": {}}, "marsyas/gtzan": {"name": "marsyas/gtzan", "description": "GTZAN is a dataset for musical genre classification of audio signals. The dataset consists of 1,000 audio tracks, each of 30 seconds long. It contains 10 genres, each represented by 100 tracks. The tracks are all 22,050Hz Mono 16-bit audio files in WAV format. The genres are: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, and rock.", "evaluation_metadata": {}}, "GEM/xwikis": {"name": "GEM/xwikis", "description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.", "evaluation_metadata": {}}, "oscar-corpus/OSCAR-2201": {"name": "oscar-corpus/OSCAR-2201", "description": "The Open Super-large Crawled Aggregated coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the Ungoliant architecture.\\", "evaluation_metadata": {}}, "cgarciae/cartoonset": {"name": "cgarciae/cartoonset", "description": "Cartoon Set is a collection of random, 2D cartoon avatar images. The cartoons vary in 10 artwork \ncategories, 4 color categories, and 4 proportion categories, with a total of ~1013 possible \ncombinations. We provide sets of 10k and 100k randomly chosen cartoons and labeled attributes.", "evaluation_metadata": {}}, "conll2012_ontonotesv5": {"name": "conll2012_ontonotesv5", "description": "OntoNotes v5.0 is the final version of OntoNotes corpus, and is a large-scale, multi-genre,\nmultilingual corpus manually annotated with syntactic, semantic and discourse information.\n\nThis dataset is the version of OntoNotes v5.0 extended and is used in the CoNLL-2012 shared task.\nIt includes v4 train/dev and v9 test data for English/Chinese/Arabic and corrected version v12 train/dev/test data (English only).\n\nThe source of data is the Mendeley Data repo [ontonotes-conll2012](https://data.mendeley.com/datasets/zmycy7t9h9), which seems to be as the same as the official data, but users should use this dataset on their own responsibility.\n\nSee also summaries from paperwithcode, [OntoNotes 5.0](https://paperswithcode.com/dataset/ontonotes-5-0) and [CoNLL-2012](https://paperswithcode.com/dataset/conll-2012-1)\n\nFor more detailed info of the dataset like annotation, tag set, etc., you can refer to the documents in the Mendeley repo mentioned above.", "evaluation_metadata": {}}, "malteos/paperswithcode-aspects": {"name": "malteos/paperswithcode-aspects", "description": "Papers with aspects from paperswithcode.com dataset", "evaluation_metadata": {}}, "shpotes/ImVisible": {"name": "shpotes/ImVisible", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "shivam/split-test": {"name": "shivam/split-test", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "umanlp/xscitldr": {"name": "umanlp/xscitldr", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "cfilt/iwn_wordlists": {"name": "cfilt/iwn_wordlists", "description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.", "evaluation_metadata": {}}, "yhavinga/ccmatrix": {"name": "yhavinga/ccmatrix", "description": "CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB\n\nWe show that margin-based bitext mining in LASER's multilingual sentence space can be applied to\nmonolingual corpora of billions of sentences to produce high quality aligned translation data.\nWe use thirty-two snapshots of a curated common crawl corpus [1] totaling 69 billion unique sentences.\nUsing one unified approach for 80 languages, we were able to mine 10.8 billion parallel sentences,\nout of which only 2.9 billion are aligned with English.\n\nIMPORTANT: Please cite reference [2][3] if you use this data.\n\n[1] Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzm\u00e1n, Armand Jouli\n and Edouard Grave, CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data\n\n[2] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin,\n CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB\n\n[3] Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines,\n Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky,\n Sergey Edunov, Edouard Grave, Michael Auli, and Armand Joulin.\n Beyond English-Centric Multilingual Machine Translation\n \n90 languages, 1,197 bitexts\ntotal number of files: 90\ntotal number of tokens: 112.14G\ntotal number of sentence fragments: 7.37G", "evaluation_metadata": {}}, "TomTBT/pmc_open_access_xml": {"name": "TomTBT/pmc_open_access_xml", "description": "The PMC Open Access Subset includes more than 3.4 million journal articles and preprints that are made available under\nlicense terms that allow reuse. \nNot all articles in PMC are available for text mining and other reuse, many have copyright protection, however articles\nin the PMC Open Access Subset are made available under Creative Commons or similar licenses that generally allow more\nliberal redistribution and reuse than a traditional copyrighted work. \nThe PMC Open Access Subset is one part of the PMC Article Datasets\n\nThis version takes XML version as source, benefiting from the structured text\nto split the articles in parts, naming the introduction, methods, results,\ndiscussion and conclusion, and refers with keywords in the text to external or internal\nresources (articles, figures, tables, formulas, boxed-text, quotes, code, footnotes, chemicals, graphics, medias).", "evaluation_metadata": {}}, "monash_tsf": {"name": "monash_tsf", "description": "Monash Time Series Forecasting Repository which contains 30+ datasets of related time series for global forecasting research. This repository includes both real-world and competition time series datasets covering varied domains.", "evaluation_metadata": {}}, "nthngdy/oscar-small": {"name": "nthngdy/oscar-small", "description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.\\", "evaluation_metadata": {}}, "tau/multi_news": {"name": "tau/multi_news", "description": "Multi-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"|||||\".\n - summary: news summary.", "evaluation_metadata": {}}, "tartuNLP/liv4ever": {"name": "tartuNLP/liv4ever", "description": "Livonian is one of the most endangered languages in Europe with just a tiny handful of speakers and virtually no publicly available corpora. \nIn this paper we tackle the task of developing neural machine translation (NMT) between Livonian and English, with a two-fold aim: on one hand, \npreserving the language and on the other \u2013 enabling access to Livonian folklore, lifestories and other textual intangible heritage as well as \nmaking it easier to create further parallel corpora. We rely on Livonian's linguistic similarity to Estonian and Latvian and collect parallel \nand monolingual data for the four languages for translation experiments. We combine different low-resource NMT techniques like zero-shot translation, \ncross-lingual transfer and synthetic data creation to reach the highest possible translation quality as well as to find which base languages are \nempirically more helpful for transfer to Livonian. The resulting NMT systems and the collected monolingual and parallel data, including a manually \ntranslated and verified translation benchmark, are publicly released.\n\nFields:\n- source: source of the data\n- en: sentence in English\n- liv: sentence in Livonian", "evaluation_metadata": {}}, "DFKI-SLT/scidtb": {"name": "DFKI-SLT/scidtb", "description": "Annotation corpus for discourse relations benefits NLP tasks such as machine translation and question\n answering. SciDTB is a domain-specific discourse treebank annotated on scientific articles.\n Different from widely-used RST-DT and PDTB, SciDTB uses dependency trees to represent discourse structure, which is\n flexible and simplified to some extent but do not sacrifice structural integrity. We discuss the labeling framework,\n annotation workflow and some statistics about SciDTB. Furthermore, our treebank is made as a benchmark for evaluating\n discourse dependency parsers, on which we provide several baselines as fundamental work.", "evaluation_metadata": {}}, "roman_urdu_hate_speech": {"name": "roman_urdu_hate_speech", "description": " The Roman Urdu Hate-Speech and Offensive Language Detection (RUHSOLD) dataset is a Roman Urdu dataset of tweets annotated by experts in the relevant language. The authors develop the gold-standard for two sub-tasks. First sub-task is based on binary labels of Hate-Offensive content and Normal content (i.e., inoffensive language). These labels are self-explanatory. The authors refer to this sub-task as coarse-grained classification. Second sub-task defines Hate-Offensive content with four labels at a granular level. These labels are the most relevant for the demographic of users who converse in RU and are defined in related literature. The authors refer to this sub-task as fine-grained classification. The objective behind creating two gold-standards is to enable the researchers to evaluate the hate speech detection approaches on both easier (coarse-grained) and challenging (fine-grained) scenarios. \\", "evaluation_metadata": {}}, "facebook/winoground": {"name": "facebook/winoground", "description": "Winoground is a novel task and dataset for evaluating the ability of vision and language models to conduct visio-linguistic compositional reasoning. Given two images and two captions, the goal is to match them correctly\u2014but crucially, both captions contain a completely identical set of words/morphemes, only in a different order. The dataset was carefully hand-curated by expert annotators and is labeled with a rich set of fine-grained tags to assist in analyzing model performance. In our accompanying paper, we probe a diverse range of state-of-the-art vision and language models and find that, surprisingly, none of them do much better than chance. Evidently, these models are not as skilled at visio-linguistic compositional reasoning as we might have hoped. In the paper, we perform an extensive analysis to obtain insights into how future work might try to mitigate these models\u2019 shortcomings. We aim for Winoground to serve as a useful evaluation set for advancing the state of the art and driving further progress in the field.", "evaluation_metadata": {}}, "jglaser/pdbbind_complexes": {"name": "jglaser/pdbbind_complexes", "description": "A dataset to fine-tune language models on protein-ligand binding affinity and contact prediction.", "evaluation_metadata": {}}, "adv_glue": {"name": "adv_glue", "description": "Adversarial GLUE Benchmark (AdvGLUE) is a comprehensive robustness evaluation benchmark\nthat focuses on the adversarial robustness evaluation of language models. It covers five\nnatural language understanding tasks from the famous GLUE tasks and is an adversarial\nversion of GLUE benchmark.", "evaluation_metadata": {}}, "copenlu/sufficient_facts": {"name": "copenlu/sufficient_facts", "description": "SufficientFacts is a diagnostic test dataset for fact checking with insufficient evidence.", "evaluation_metadata": {}}, "Samip/Scotch": {"name": "Samip/Scotch", "description": "Scotch is a dataset of about 19 million functions collected from open-source repositiories from GitHub with permissive licenses. Each function has its corresponding code context and about 4 million functions have corresponding docstrings. The dataset includes functions written in programming languages Python, Java, Javascript, and Go.", "evaluation_metadata": {}}, "KevinZ/psycholinguistic_eval": {"name": "KevinZ/psycholinguistic_eval", "description": "Psycholinguistic dataset from 'What BERT is not: Lessons from a new suite of psycholinguistic diagnostics for language models'\nby Allyson Ettinger", "evaluation_metadata": {}}, "huggingartists/olga-buzova": {"name": "huggingartists/olga-buzova", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "SocialGrep/the-reddit-dataset-dataset": {"name": "SocialGrep/the-reddit-dataset-dataset", "description": "A meta dataset of Reddit's own /r/datasets community.", "evaluation_metadata": {}}, "PolyAI/minds14": {"name": "PolyAI/minds14", "description": "MINDS-14 is training and evaluation resource for intent\ndetection task with spoken data. It covers 14\nintents extracted from a commercial system\nin the e-banking domain, associated with spoken examples in 14 diverse language varieties.", "evaluation_metadata": {}}, "SocialGrep/the-reddit-place-dataset": {"name": "SocialGrep/the-reddit-place-dataset", "description": "The written history or /r/Place, in posts and comments.", "evaluation_metadata": {}}, "StanBienaives/french-open-fiscal-texts": {"name": "StanBienaives/french-open-fiscal-texts", "description": " This dataset is an extraction from the OPENDATA/JADE. A list of case laws from the French court \"Conseil d'Etat\".", "evaluation_metadata": {}}, "ukr-models/Ukr-Synth": {"name": "ukr-models/Ukr-Synth", "description": "Large silver standard Ukrainian corpus annotated with morphology tags, syntax trees and PER, LOC, ORG NER-tags.", "evaluation_metadata": {}}, "skt/kobest_v1": {"name": "skt/kobest_v1", "description": " The dataset contains data for KoBEST dataset", "evaluation_metadata": {}}, "bergoliveira/pl-corpus": {"name": "bergoliveira/pl-corpus", "description": "PL-corpus is a Portuguese language dataset for named entity recognition applied to legislative documents. Its parte of the UlyssesBR-corpus, and consists entirely of manually annotated public bills texts (projetos de leis) and contains tags for persons, locations, date entities, organizations, legal foundation and bills.", "evaluation_metadata": {}}, "McGill-NLP/TopiOCQA": {"name": "McGill-NLP/TopiOCQA", "description": "TopiOCQA is an information-seeking conversational dataset with challenging topic switching phenomena.", "evaluation_metadata": {}}, "csebuetnlp/squad_bn": {"name": "csebuetnlp/squad_bn", "description": "SQuAD-bn is derived from the SQuAD-2.0 and TyDI-QA datasets.", "evaluation_metadata": {}}, "taln-ls2n/inspec": {"name": "taln-ls2n/inspec", "description": "Inspec benchmark dataset for keyphrase extraction an generation.", "evaluation_metadata": {}}, "gsm8k": {"name": "gsm8k", "description": "GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality\nlinguistically diverse grade school math word problems. The\ndataset was created to support the task of question answering\non basic mathematical problems that require multi-step reasoning.", "evaluation_metadata": {}}, "sbu_captions": {"name": "sbu_captions", "description": "The SBU Captioned Photo Dataset is a collection of over 1 million images with associated text descriptions extracted from Flicker.", "evaluation_metadata": {}}, "bullmount/squad_it": {"name": "bullmount/squad_it", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "taln-ls2n/kp20k": {"name": "taln-ls2n/kp20k", "description": "KP20k dataset for keyphrase extraction and generation in scientific paper.", "evaluation_metadata": {}}, "patriziobellan/PET": {"name": "patriziobellan/PET", "description": "Abstract. Although there is a long tradition of work in NLP on extracting entities and relations from text, to date there exists little work on the acquisition of business processes from unstructured data such as textual corpora of process descriptions. With this work we aim at filling this gap and establishing the first steps towards bridging data-driven information extraction methodologies from Natural Language Processing and the model-based formalization that is aimed from Business Process Management. For this, we develop the first corpus of business process descriptions annotated with activities, gateways, actors and flow information. We present our new resource, including a detailed overview of the annotation schema and guidelines, as well as a variety of baselines to benchmark the difficulty and challenges of business process extraction from text.", "evaluation_metadata": {}}, "conceptual_captions": {"name": "conceptual_captions", "description": "Google's Conceptual Captions dataset has more than 3 million images, paired with natural-language captions.\nIn contrast with the curated style of the MS-COCO images, Conceptual Captions images and their raw descriptions are harvested from the web,\nand therefore represent a wider variety of styles. The raw descriptions are harvested from the Alt-text HTML attribute associated with web images.\nThe authors developed an automatic pipeline that extracts, filters, and transforms candidate image/caption pairs, with the goal of achieving a balance of cleanliness,\ninformativeness, fluency, and learnability of the resulting captions.", "evaluation_metadata": {}}, "craffel/tasky_or_not": {"name": "craffel/tasky_or_not", "description": "This dataset is a collection of prompted examples from P3 and examples from C4.\nThe C4 examples are labeled \"not-task-like\" and the P3 examples are\n\"task-like\". Examples were sampled from C4 so that the distribution of example\nlengths is similar for C4 and P3 examples. Some datasets from P3 were ignored\nbecause their examples were too long. Some datasets from P3 are held out for\nvalidation. Non-tasky validation data was gathered from C4 without\nintentionally matching the length distribution. Tasky data was gathered from\nthe validation set of certain held-out datasets from P3.", "evaluation_metadata": {}}, "bullmount/squad-it-exp": {"name": "bullmount/squad-it-exp", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "conceptual_12m": {"name": "conceptual_12m", "description": "Conceptual 12M is a large-scale dataset of 12 million\nimage-text pairs specifically meant to be used for visionand-language pre-training.\nIts data collection pipeline is a relaxed version of the one used in Conceptual Captions 3M.", "evaluation_metadata": {}}, "surrey-nlp/PLOD-filtered": {"name": "surrey-nlp/PLOD-filtered", "description": "This is the dataset repository for PLOD Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Abbreviation Detection.", "evaluation_metadata": {}}, "surrey-nlp/PLOD-unfiltered": {"name": "surrey-nlp/PLOD-unfiltered", "description": "This is the dataset repository for PLOD Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Abbreviation Detection.", "evaluation_metadata": {}}, "Divyanshu/indicxnli": {"name": "Divyanshu/indicxnli", "description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).", "evaluation_metadata": {}}, "KevinZ/oLMpics": {"name": "KevinZ/oLMpics", "description": "This is a set a eight datasets from the paper \"oLMpics - On what Language Model Pre-training Captures\"\nby Alon Talmor et al.", "evaluation_metadata": {}}, "SocialGrep/the-reddit-irl-dataset": {"name": "SocialGrep/the-reddit-irl-dataset", "description": "Data from the humour subreddits /r/meirl and /r/me_irl, up to Apr 1 2022", "evaluation_metadata": {}}, "taln-ls2n/wikinews-fr-100": {"name": "taln-ls2n/wikinews-fr-100", "description": "Wikinews-fr-100 benchmark dataset for keyphrase extraction an generation.", "evaluation_metadata": {}}, "taln-ls2n/taln-archives": {"name": "taln-ls2n/taln-archives", "description": "TALN Archives benchmark dataset for keyphrase extraction an generation.", "evaluation_metadata": {}}, "arka0821/multi_document_summarization": {"name": "arka0821/multi_document_summarization", "description": "Multi-Document, a large-scale multi-document summarization dataset created from scientific articles. Multi-Document introduces a challenging multi-document summarization task: writing the related-work section of a paper based on its abstract and the articles it references.", "evaluation_metadata": {}}, "enoriega/biocreative_gene_mention": {"name": "enoriega/biocreative_gene_mention", "description": "Training and validation datasets for the BioCreative II gene mention task.\nThe data has been tokenized with [processors](https://github.com/clulab/processors)\n## Features:\n- __tokens__: Input token sequence\n- __folded_tokens__: Same as tokens, but case-folded\n- __tags__: POS tags of the input sequence tokens\n- __labels__: BIO sequence tags", "evaluation_metadata": {}}, "hapandya/sqnnr": {"name": "hapandya/sqnnr", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "NbAiLab/NST": {"name": "NbAiLab/NST", "description": "This database was created by Nordic Language Technology for the development of automatic speech recognition and dictation in Norwegian. In this version, the organization of the data have been altered to improve the usefulness of the database.\n\nThe acoustic databases described below were developed by the firm Nordisk spr\u00e5kteknologi holding AS (NST), which went bankrupt in 2003. In 2006, a consortium consisting of the University of Oslo, the University of Bergen, the Norwegian University of Science and Technology, the Norwegian Language Council and IBM bought the bankruptcy estate of NST, in order to ensure that the language resources developed by NST were preserved. In 2009, the Norwegian Ministry of Culture charged the National Library of Norway with the task of creating a Norwegian language bank, which they initiated in 2010. The resources from NST were transferred to the National Library in May 2011, and are now made available in Spr\u00e5kbanken, for the time being without any further modification. Spr\u00e5kbanken is open for feedback from users about how the resources can be improved, and we are also interested in improved versions of the databases that users wish to share with other users. Please send response and feedback to sprakbanken@nb.no.", "evaluation_metadata": {}}, "Yaxin/SemEval2016Task5Raw": {"name": "Yaxin/SemEval2016Task5Raw", "description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.", "evaluation_metadata": {}}, "mweiss/fashion_mnist_corrupted": {"name": "mweiss/fashion_mnist_corrupted", "description": "Fashion-MNIST is dataset of fashion images, indended as a drop-in replacement for the MNIST dataset.\nThis dataset (Fashion-Mnist-Corrupted) provides out-of-distribution data for the Fashion-Mnist\ndataset. Fashion-Mnist-Corrupted is based on a similar project for MNIST, called MNIST-C, by Mu et. al.", "evaluation_metadata": {}}, "visual_genome": {"name": "visual_genome", "description": "Visual Genome enable to model objects and relationships between objects.\nThey collect dense annotations of objects, attributes, and relationships within each image.\nSpecifically, the dataset contains over 108K images where each image has an average of 35 objects, 26 attributes, and 21 pairwise relationships between objects.", "evaluation_metadata": {}}, "Yaxin/SemEval2014Task4Raw": {"name": "Yaxin/SemEval2014Task4Raw", "description": "A collection of SemEval2014 specifically designed to aid research in Aspect Based Sentiment Analysis.", "evaluation_metadata": {}}, "Yaxin/SemEval2015Task12Raw": {"name": "Yaxin/SemEval2015Task12Raw", "description": "A collection of SemEval2015 specifically designed to aid research in Aspect Based Sentiment Analysis.", "evaluation_metadata": {}}, "aharley/rvl_cdip": {"name": "aharley/rvl_cdip", "description": "The RVL-CDIP (Ryerson Vision Lab Complex Document Information Processing) dataset consists of 400,000 grayscale images in 16 classes, with 25,000 images per class. There are 320,000 training images, 40,000 validation images, and 40,000 test images.", "evaluation_metadata": {}}, "patrickvonplaten/librispeech_asr_self_contained": {"name": "patrickvonplaten/librispeech_asr_self_contained", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "adithya7/xlel_wd_dictionary": {"name": "adithya7/xlel_wd_dictionary", "description": "XLEL-WD is a multilingual event linking dataset. This sub-dataset contains a dictionary of events from Wikidata. The multilingual descriptions for Wikidata event items are taken from the corresponding Wikipedia articles.", "evaluation_metadata": {}}, "adithya7/xlel_wd": {"name": "adithya7/xlel_wd", "description": "XLEL-WD is a multilingual event linking dataset. This dataset contains mention references from multilingual Wikipedia/Wikinews articles to event items in Wikidata. The text descriptions for Wikidata events are compiled from Wikipedia articles.", "evaluation_metadata": {}}, "taln-ls2n/termith-eval": {"name": "taln-ls2n/termith-eval", "description": "TermITH-Eval benchmark dataset for keyphrase extraction an generation.", "evaluation_metadata": {}}, "cfilt/HiNER-collapsed": {"name": "cfilt/HiNER-collapsed", "description": "This is the repository for HiNER - a large Hindi Named Entity Recognition dataset.", "evaluation_metadata": {}}, "surrey-nlp/SDU-test": {"name": "surrey-nlp/SDU-test", "description": "This is the dataset repository for SDU Dataset from SDU workshop at AAAI22.\nThe dataset can help build sequence labelling models for the task Abbreviation Detection.", "evaluation_metadata": {}}, "taln-ls2n/semeval-2010-pre": {"name": "taln-ls2n/semeval-2010-pre", "description": "Preprocessed SemEval-2010 Benchmark dataset for Keyphrase Generation.", "evaluation_metadata": {}}, "qanastek/MASSIVE": {"name": "qanastek/MASSIVE", "description": "MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\nfor the Natural Language Understanding tasks of intent prediction and slot annotation.\nUtterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\nthe SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.", "evaluation_metadata": {}}, "McGill-NLP/FaithDial": {"name": "McGill-NLP/FaithDial", "description": "FaithDial is a new benchmark for hallucination-free dialogues, created by manually editing hallucinated and uncooperative responses in Wizard of Wikipedia.", "evaluation_metadata": {}}, "BritishLibraryLabs/web_archive_classification": {"name": "BritishLibraryLabs/web_archive_classification", "description": " The dataset comprises a manually curated selective archive produced by UKWA which includes the classification of sites into a two-tiered subject hierarchy.", "evaluation_metadata": {}}, "cfilt/HiNER-original": {"name": "cfilt/HiNER-original", "description": "This is the dataset repository for HiNER Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Named Entity Recognitin for the Hindi language.", "evaluation_metadata": {}}, "RohanAiLab/persian_blog_V2": {"name": "RohanAiLab/persian_blog_V2", "description": "persian_blog is a dataset consist of 700K blog posts from various websites and has types of tones.\nthis dataset can be used in different NLG tasks and as a show-case it's is used in training reformer-persian.", "evaluation_metadata": {}}, "BigScienceBiasEval/crows_pairs_multilingual": {"name": "BigScienceBiasEval/crows_pairs_multilingual", "description": "This is a revised version of CrowS-Pairs that measures stereotypes in language modelling in both English and French.", "evaluation_metadata": {}}, "khalidalt/HuffPost": {"name": "khalidalt/HuffPost", "description": "A dataset of approximately 200K news headlines from the year 2012 to 2018 collected from HuffPost.", "evaluation_metadata": {}}, "SocialGrep/the-reddit-nft-dataset": {"name": "SocialGrep/the-reddit-nft-dataset", "description": "A comprehensive dataset of Reddit's NFT discussion.", "evaluation_metadata": {}}, "aakanksha/udpos": {"name": "aakanksha/udpos", "description": "Universal Dependencies is an open community effort to create cross-linguistically consistent treebank annotation for many languages within a dependency-based lexicalist framework. The annotation consists in a linguistically motivated word segmentation; a morphological layer comprising lemmas, universal part-of-speech tags, and standardized morphological features; and a syntactic layer focusing on syntactic relations between predicates, arguments and modifiers.", "evaluation_metadata": {}}, "wza/TimeTravel": {"name": "wza/TimeTravel", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "PolyAI/banking77": {"name": "PolyAI/banking77", "description": "BANKING77 dataset provides a very fine-grained set of intents in a banking domain.\nIt comprises 13,083 customer service queries labeled with 77 intents.\nIt focuses on fine-grained single-domain intent detection.", "evaluation_metadata": {}}, "bigscience-catalogue-data/bias-shades": {"name": "bigscience-catalogue-data/bias-shades", "description": "This is a preliminary version of the bias SHADES dataset for evaluating LMs for social biases.", "evaluation_metadata": {}}, "AmazonScience/massive": {"name": "AmazonScience/massive", "description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.", "evaluation_metadata": {}}, "strombergnlp/broad_twitter_corpus": {"name": "strombergnlp/broad_twitter_corpus", "description": "This is the Broad Twitter corpus, a dataset of tweets collected over stratified times, places and social uses. \nThe goal is to represent a broad range of activities, giving a dataset more representative of the language used \nin this hardest of social media formats to process. Further, the BTC is annotated for named entities.\n\nFor more details see [https://aclanthology.org/C16-1111/](https://aclanthology.org/C16-1111/)", "evaluation_metadata": {}}, "strombergnlp/ipm_nel": {"name": "strombergnlp/ipm_nel", "description": "This data is for the task of named entity recognition and linking/disambiguation over tweets. It comprises\nthe addition of an entity URI layer on top of an NER-annotated tweet dataset. The task is to detect entities\nand then provide a correct link to them in DBpedia, thus disambiguating otherwise ambiguous entity surface\nforms; for example, this means linking \"Paris\" to the correct instance of a city named that (e.g. Paris, \nFrance vs. Paris, Texas).\n\nThe data concentrates on ten types of named entities: company, facility, geographic location, movie, musical\nartist, person, product, sports team, TV show, and other.\n\nThe file is tab separated, in CoNLL format, with line breaks between tweets.\nData preserves the tokenisation used in the Ritter datasets.\nPoS labels are not present for all tweets, but where they could be found in the Ritter\ndata, they're given. In cases where a URI could not be agreed, or was not present in\nDBpedia, there is a NIL. See the paper for a full description of the methodology.\n\nFor more details see http://www.derczynski.com/papers/ner_single.pdf or https://www.sciencedirect.com/science/article/abs/pii/S0306457314001034", "evaluation_metadata": {}}, "strombergnlp/shaj": {"name": "strombergnlp/shaj", "description": "This is an abusive/offensive language detection dataset for Albanian. The data is formatted\nfollowing the OffensEval convention, with three tasks:\n\n* Subtask A: Offensive (OFF) or not (NOT)\n* Subtask B: Untargeted (UNT) or targeted insult (TIN)\n* Subtask C: Type of target: individual (IND), group (GRP), or other (OTH)\n\n* The subtask A field should always be filled.\n* The subtask B field should only be filled if there's \"offensive\" (OFF) in A.\n* The subtask C field should only be filled if there's \"targeted\" (TIN) in B.\n\nThe dataset name is a backronym, also standing for \"Spoken Hate in the Albanian Jargon\"\n\nSee the paper [https://arxiv.org/abs/2107.13592](https://arxiv.org/abs/2107.13592) for full details.", "evaluation_metadata": {}}, "strombergnlp/dkstance": {"name": "strombergnlp/dkstance", "description": "This dataset presents a series of stories on Reddit and the conversation around\nthem, annotated for stance. Stories are also annotated for veracity.\n\nFor more details see https://aclanthology.org/W19-6122/", "evaluation_metadata": {}}, "strombergnlp/polstance": {"name": "strombergnlp/polstance", "description": "Political stance in Danish. Examples represent statements by \npoliticians and are annotated for, against, or neutral to a given topic/article.", "evaluation_metadata": {}}, "strombergnlp/bornholmsk": {"name": "strombergnlp/bornholmsk", "description": "This corpus introduces language processing resources and tools for Bornholmsk, a language spoken on the island of Bornholm, with roots in Danish and closely related to Scanian. \n\nSammenfattnijng p\u00e5 borrijnholmst: D\u00e6jnna artikkelijn introduserer naturspr\u00e5gsresurser \u00e5 varktoi for borrijnholmst, ed spr\u00e5g a d\u00e6r snakkes p\u00e5 \u00f6n Borrijnholm me r\u00f8dder i danst \u00e5 i n\u00e6r familia me sk\u00e5nst.", "evaluation_metadata": {}}, "strombergnlp/twitter_pos_vcb": {"name": "strombergnlp/twitter_pos_vcb", "description": "Part-of-speech information is basic NLP task. However, Twitter text\nis difficult to part-of-speech tag: it is noisy, with linguistic errors and idiosyncratic style.\nThis data is the vote-constrained bootstrapped data generate to support state-of-the-art results.\n\nThe data is about 1.5 million English tweets annotated for part-of-speech using Ritter's extension of the PTB tagset.\nThe tweets are from 2012 and 2013, tokenized using the GATE tokenizer and tagged\njointly using the CMU ARK tagger and Ritter's T-POS tagger. Only when both these taggers' outputs\nare completely compatible over a whole tweet, is that tweet added to the dataset.\n\nThis data is recommend for use a training data **only**, and not evaluation data.\n\nFor more details see https://gate.ac.uk/wiki/twitter-postagger.html and https://aclanthology.org/R13-1026.pdf", "evaluation_metadata": {}}, "strombergnlp/zulu_stance": {"name": "strombergnlp/zulu_stance", "description": "This is a stance detection dataset in the Zulu language. The data is translated to Zulu by Zulu native speakers, from English source texts.\n\nMisinformation has become a major concern in recent last years given its \nspread across our information sources. In the past years, many NLP tasks have\nbeen introduced in this area, with some systems reaching good results on \nEnglish language datasets. Existing AI based approaches for fighting \nmisinformation in literature suggest automatic stance detection as an integral\nfirst step to success. Our paper aims at utilizing this progress made for\nEnglish to transfers that knowledge into other languages, which is a \nnon-trivial task due to the domain gap between English and the target \nlanguages. We propose a black-box non-intrusive method that utilizes techniques\nfrom Domain Adaptation to reduce the domain gap, without requiring any human\nexpertise in the target language, by leveraging low-quality data in both a\nsupervised and unsupervised manner. This allows us to rapidly achieve similar\nresults for stance detection for the Zulu language, the target language in\nthis work, as are found for English. We also provide a stance detection dataset\nin the Zulu language.", "evaluation_metadata": {}}, "BigScienceBiasEval/bias-shades": {"name": "BigScienceBiasEval/bias-shades", "description": "This is a preliminary version of the bias SHADES dataset for evaluating LMs for social biases.", "evaluation_metadata": {}}, "muibk/wmt21_metrics_task": {"name": "muibk/wmt21_metrics_task", "description": "This shared task will examine automatic evaluation metrics for machine translation. We will \nprovide you with MT system outputs along with source text and the human reference translations. \nWe are looking for automatic metric scores for translations at the system-level, and segment-level. \nWe will calculate the system-level, and segment-level correlations of your scores with human judgements.\n\nWe invite submissions of reference-free metrics in addition to reference-based metrics.", "evaluation_metadata": {}}, "aps/dynahate": {"name": "aps/dynahate", "description": "We present a human-and-model-in-the-loop process for dynamically generating datasets and training better performing and more robust hate detection models. We provide a new dataset of ~40,000 entries, generated and labelled by trained annotators over four rounds of dynamic data creation. It includes ~15,000 challenging perturbations and each hateful entry has fine-grained labels for the type and target of hate. Hateful entries make up 54% of the dataset, which is substantially higher than comparable datasets. We show that model performance is substantially improved using this approach. Models trained on later rounds of data collection perform better on test sets and are harder for annotators to trick. They also perform better on HATECHECK, a suite of functional tests for online hate detection. See https://arxiv.org/abs/2012.15761 for more details.", "evaluation_metadata": {}}, "Filippo/osdg_cd": {"name": "Filippo/osdg_cd", "description": "The OSDG Community Dataset (OSDG-CD) is a public dataset of thousands of text excerpts, which were validated by approximately 1,000 OSDG Community Platform (OSDG-CP) citizen scientists from over 110 countries, with respect to the Sustainable Development Goals (SDGs).", "evaluation_metadata": {}}, "NazaGara/wikiner-es": {"name": "NazaGara/wikiner-es", "description": "Dataset used to train a NER model", "evaluation_metadata": {}}, "skg/toxigen-data": {"name": "skg/toxigen-data", "description": "Toxigen is a large-scale dataset containing implicitly toxic and benign sentences mentioning 13 minority groups, and a tool to stress test a given off-the-shelf toxicity classifier. The dataset is generated using a large language model (GPT3). It is intended to be used for training classifiers that learn to detect subtle hate speech that includes no slurs or profanity.", "evaluation_metadata": {}}, "google/wit": {"name": "google/wit", "description": "Wikipedia-based Image Text (WIT) Dataset is a large multimodal multilingual dataset.\nWIT is composed of a curated set of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia languages.\nIts size enables WIT to be used as a pretraining dataset for multimodal machine learning models.", "evaluation_metadata": {}}, "imagenet-1k": {"name": "imagenet-1k", "description": "ILSVRC 2012, commonly known as 'ImageNet' is an image dataset organized according to the WordNet hierarchy. Each meaningful concept in WordNet, possibly described by multiple words or word phrases, is called a \"synonym set\" or \"synset\". There are more than 100,000 synsets in WordNet, majority of them are nouns (80,000+). ImageNet aims to provide on average 1000 images to illustrate each synset. Images of each concept are quality-controlled and human-annotated. In its completion, ImageNet hopes to offer tens of millions of cleanly sorted images for most of the concepts in the WordNet hierarchy. ImageNet 2012 is the most commonly used subset of ImageNet. This dataset spans 1000 object classes and contains 1,281,167 training images, 50,000 validation images and 100,000 test images", "evaluation_metadata": {}}, "arbml/masader": {"name": "arbml/masader", "description": "Masader is the largest public catalogue for Arabic NLP datasets, which consists of more than 200 datasets annotated with 25 attributes.", "evaluation_metadata": {}}, "wza/roc_stories": {"name": "wza/roc_stories", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "tne": {"name": "tne", "description": "TNE is an NLU task, which focus on relations between noun phrases (NPs) that can be mediated via prepositions.\nThe dataset contains 5,497 documents, annotated exhaustively with all possible links between the NPs in each document.", "evaluation_metadata": {}}, "textvqa": {"name": "textvqa", "description": "TextVQA requires models to read and reason about text in images to answer questions about them.\nSpecifically, models need to incorporate a new modality of text present in the images and reason\nover it to answer TextVQA questions. TextVQA dataset contains 45,336 questions over 28,408 images\nfrom the OpenImages dataset.", "evaluation_metadata": {}}, "ett": {"name": "ett", "description": "The data of Electricity Transformers from two separated counties\nin China collected for two years at hourly and 15-min frequencies.\nEach data point consists of the target value \"oil temperature\" and\n6 power load features. The train/val/test is 12/4/4 months.", "evaluation_metadata": {}}, "sil-ai/bloom-vist": {"name": "sil-ai/bloom-vist", "description": "This version of the Bloom Library data is developed specifically for the Visual Story Telling (VIST) task.\nIt includes data from 363 languages across 36 language families, with many of the languages represented\nbeing extremely low resourced languages.", "evaluation_metadata": {}}, "ashraq/youtube-transcription": {"name": "ashraq/youtube-transcription", "description": "This is YouTube video transcription dataset built from YTTTS Speech Collection for semantic search.", "evaluation_metadata": {}}, "searle-j/kote": {"name": "searle-j/kote", "description": "50k Korean online comments labeled for 44 emotion categories.", "evaluation_metadata": {}}, "medmcqa": {"name": "medmcqa", "description": "MedMCQA is a large-scale, Multiple-Choice Question Answering (MCQA) dataset designed to address real-world medical entrance exam questions.\nMedMCQA has more than 194k high-quality AIIMS & NEET PG entrance exam MCQs covering 2.4k healthcare topics and 21 medical subjects are collected with an average token length of 12.77 and high topical diversity.\nThe dataset contains questions about the following topics: Anesthesia, Anatomy, Biochemistry, Dental, ENT, Forensic Medicine (FM)\nObstetrics and Gynecology (O&G), Medicine, Microbiology, Ophthalmology, Orthopedics Pathology, Pediatrics, Pharmacology, Physiology,\nPsychiatry, Radiology Skin, Preventive & Social Medicine (PSM) and Surgery", "evaluation_metadata": {}}, "filwsyl/video_tags": {"name": "filwsyl/video_tags", "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.", "evaluation_metadata": {}}, "taln-ls2n/kptimes": {"name": "taln-ls2n/kptimes", "description": "KPTimes benchmark dataset for keyphrase extraction an generation.", "evaluation_metadata": {}}, "polinaeterna/vox_lingua": {"name": "polinaeterna/vox_lingua", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "strombergnlp/twitter_pos": {"name": "strombergnlp/twitter_pos", "description": "Part-of-speech information is basic NLP task. However, Twitter text\nis difficult to part-of-speech tag: it is noisy, with linguistic errors and idiosyncratic style.\nThis dataset contains two datasets for English PoS tagging for tweets:\n\n* Ritter, with train/dev/test\n* Foster, with dev/test\n\nSplits defined in the Derczynski paper, but the data is from Ritter and Foster.\n\nFor more details see:\n\n* https://gate.ac.uk/wiki/twitter-postagger.html\n* https://aclanthology.org/D11-1141.pdf\n* https://www.aaai.org/ocs/index.php/ws/aaaiw11/paper/download/3912/4191", "evaluation_metadata": {}}, "Bingsu/arcalive_220506": {"name": "Bingsu/arcalive_220506", "description": "[\uc544\uce74\ub77c\uc774\ube0c \ubca0\uc2a4\ud2b8 \ub77c\uc774\ube0c \ucc44\ub110](https://arca.live/b/live)\uc758 2021\ub144 8\uc6d4 16\uc77c\ubd80\ud130 2022\ub144 5\uc6d4 6\uc77c\uae4c\uc9c0\uc758 \ub370\uc774\ud130\ub97c \uc218\uc9d1\ud558\uc5ec, \ub313\uae00\ub9cc \uace8\ub77c\ub0b8 \ub370\uc774\ud130\uc785\ub2c8\ub2e4.", "evaluation_metadata": {}}, "manirai91/ebiquity-v2": {"name": "manirai91/ebiquity-v2", "description": "Ebiquity V2 (non-stemmed) dataset for Nepali NER task. The dataset is tagged with BIO scheme.", "evaluation_metadata": {}}, "nateraw/imagenet-sketch": {"name": "nateraw/imagenet-sketch", "description": "ImageNet-Sketch data set consists of 50000 images, 50 images for each of the 1000 ImageNet classes.\nWe construct the data set with Google Image queries \"sketch of __\", where __ is the standard class name.\nWe only search within the \"black and white\" color scheme. We initially query 100 images for every class,\nand then manually clean the pulled images by deleting the irrelevant images and images that are for similar\nbut different classes. For some classes, there are less than 50 images after manually cleaning, and then we\naugment the data set by flipping and rotating the images.", "evaluation_metadata": {}}, "strombergnlp/rustance": {"name": "strombergnlp/rustance", "description": "This is a stance prediction dataset in Russian. The dataset contains comments on news articles,\nand rows are a comment, the title of the news article it responds to, and the stance of the comment\ntowards the article.", "evaluation_metadata": {}}, "ccdv/WCEP-10": {"name": "ccdv/WCEP-10", "description": "WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. Xiao et al.\"", "evaluation_metadata": {}}, "jerpint/imagenette": {"name": "jerpint/imagenette", "description": "# ImageNette\n\nImagenette is a subset of 10 easily classified classes from Imagenet (tench, English springer, cassette player, chain saw, church, French horn, garbage truck, gas pump, golf ball, parachute).\n\n'Imagenette' is pronounced just like 'Imagenet', except with a corny inauthentic French accent.\nIf you've seen Peter Sellars in The Pink Panther, then think something like that.\nIt's important to ham up the accent as much as possible, otherwise people might not be sure whether you're refering to \"Imagenette\" or \"Imagenet\".\n(Note to native French speakers: to avoid confusion, be sure to use a corny inauthentic American accent when saying \"Imagenet\".\nThink something like the philosophy restaurant skit from Monty Python's The Meaning of Life.)\n\nThis version of the dataset allows researchers/practitioners to quickly try out\nideas and share with others. The dataset comes in three variants:\n * Full size\n * 320 px\n * 160 px\n\nThe '320 px' and '160 px' versions have their shortest side resized to that size, with their aspect ratio maintained.\n\n\nToo easy for you? In that case, you might want to try Imagewoof.\n\n# Imagewoof\nImagewoof is a subset of 10 classes from Imagenet that aren't so easy to classify, since they're all dog breeds.\nThe breeds are: Australian terrier, Border terrier, Samoyed, Beagle, Shih-Tzu, English foxhound, Rhodesian ridgeback, Dingo, Golden retriever, Old English sheepdog.\n(No we will not enter in to any discussion in to whether a dingo is in fact a dog.\nAny suggestions to the contrary are un-Australian. Thank you for your cooperation.)\n\nFull size download;\n320 px download;\n160 px download.", "evaluation_metadata": {}}, "drAbreu/sd-nlp-2": {"name": "drAbreu/sd-nlp-2", "description": " This dataset is based on the SourceData database and is intented to facilitate training of NLP tasks in the cell and molecualr biology domain.", "evaluation_metadata": {}}, "strombergnlp/offenseval_2020": {"name": "strombergnlp/offenseval_2020", "description": "OffensEval 2020 features a multilingual dataset with five languages. The languages included in OffensEval 2020 are:\n\n* Arabic\n* Danish\n* English\n* Greek\n* Turkish\n\nThe annotation follows the hierarchical tagset proposed in the Offensive Language Identification Dataset (OLID) and used in OffensEval 2019. \nIn this taxonomy we break down offensive content into the following three sub-tasks taking the type and target of offensive content into account. \nThe following sub-tasks were organized:\n\n* Sub-task A - Offensive language identification;\n* Sub-task B - Automatic categorization of offense types;\n* Sub-task C - Offense target identification.\n\nThe English training data isn't included here (the text isn't available and needs rehydration of 9 million tweets; \nsee [https://zenodo.org/record/3950379#.XxZ-aFVKipp](https://zenodo.org/record/3950379#.XxZ-aFVKipp))", "evaluation_metadata": {}}, "MilaNLProc/honest": {"name": "MilaNLProc/honest", "description": "HONEST dataset comprises a set of templates for measuring hurtful sentence completions in language models. The templates are provided in six languages (English, Italian, French, Portuguese, Romanian, and Spanish) for binary gender and in English for LGBTQAI+ individuals. WARNING: This dataset contains content that are offensive and/or hateful in nature.", "evaluation_metadata": {}}, "facebook/voxpopuli": {"name": "facebook/voxpopuli", "description": "A large-scale multilingual speech corpus for representation learning, semi-supervised learning and interpretation.", "evaluation_metadata": {}}, "strombergnlp/nordic_langid": {"name": "strombergnlp/nordic_langid", "description": "Automatic language identification is a challenging problem. Discriminating\nbetween closely related languages is especially difficult. This paper presents\na machine learning approach for automatic language identification for the\nNordic languages, which often suffer miscategorisation by existing \nstate-of-the-art tools. Concretely we will focus on discrimination between six \nNordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm\u00e5l), \nFaroese and Icelandic.\n\nThis is the data for the tasks. Two variants are provided: 10K and 50K, with\nholding 10,000 and 50,000 examples for each language respectively.", "evaluation_metadata": {}}, "HuggingFaceM4/epic_kitchens_100": {"name": "HuggingFaceM4/epic_kitchens_100", "description": "EPIC-KITCHENS-100 is a large-scale dataset in first-person (egocentric) vision; multi-faceted, audio-visual,\nnon-scripted recordings in native environments - i.e. the wearers' homes, capturing all daily activities\nin the kitchen over multiple days. Annotations are collected using a novel 'Pause-and-Talk' narration interface.\n\nEPIC-KITCHENS-100 is an extension of the EPIC-KITCHENS dataset released in 2018, to 100 hours of footage.", "evaluation_metadata": {}}, "enoriega/odinsynth_dataset": {"name": "enoriega/odinsynth_dataset", "description": "Supervised training data for odinsynth", "evaluation_metadata": {}}, "NbAiLab/NST_hesitate": {"name": "NbAiLab/NST_hesitate", "description": "This database was created by Nordic Language Technology for the development of automatic speech recognition and dictation in Norwegian. In this version, the organization of the data have been altered to improve the usefulness of the database.\n\nThe acoustic databases described below were developed by the firm Nordisk spr\u00e5kteknologi holding AS (NST_hesitate), which went bankrupt in 2003. In 2006, a consortium consisting of the University of Oslo, the University of Bergen, the Norwegian University of Science and Technology, the Norwegian Language Council and IBM bought the bankruptcy estate of NST_hesitate, in order to ensure that the language resources developed by NST_hesitate were preserved. In 2009, the Norwegian Ministry of Culture charged the National Library of Norway with the task of creating a Norwegian language bank, which they initiated in 2010. The resources from NST_hesitate were transferred to the National Library in May 2011, and are now made available in Spr\u00e5kbanken, for the time being without any further modification. Spr\u00e5kbanken is open for feedback from users about how the resources can be improved, and we are also interested in improved versions of the databases that users wish to share with other users. Please send response and feedback to sprakbanken@nb.no.", "evaluation_metadata": {}}, "HuggingFaceM4/charades": {"name": "HuggingFaceM4/charades", "description": "Charades is dataset composed of 9848 videos of daily indoors activities collected through Amazon Mechanical Turk. 267 different users were presented with a sentence, that includes objects and actions from a fixed vocabulary, and they recorded a video acting out the sentence (like in a game of Charades). The dataset contains 66,500 temporal annotations for 157 action classes, 41,104 labels for 46 object classes, and 27,847 textual descriptions of the videos.", "evaluation_metadata": {}}, "strombergnlp/bornholmsk_parallel": {"name": "strombergnlp/bornholmsk_parallel", "description": "This dataset is parallel text for Bornholmsk and Danish. \n\nFor more details, see the paper [Bornholmsk Natural Language Processing: Resources and Tools](https://aclanthology.org/W19-6138/).", "evaluation_metadata": {}}, "lmqg/qg_subjqa": {"name": "lmqg/qg_subjqa", "description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "evaluation_metadata": {}}, "MLRS/korpus_malti": {"name": "MLRS/korpus_malti", "description": "General Corpora for the Maltese language.", "evaluation_metadata": {}}, "strombergnlp/named_timexes": {"name": "strombergnlp/named_timexes", "description": "This is a dataset annotated for _named temporal expression_ chunks.\n\nThe\ncommonest temporal expressions typically\ncontain date and time words, like April or\nhours. Research into recognising and interpreting these typical expressions is mature in many languages. However, there is\na class of expressions that are less typical,\nvery varied, and difficult to automatically\ninterpret. These indicate dates and times,\nbut are harder to detect because they often do not contain time words and are not\nused frequently enough to appear in conventional temporally-annotated corpora \u2013\nfor example *Michaelmas* or *Vasant Panchami*.\n\nFor more details see [https://aclanthology.org/R13-1015.pdf](https://aclanthology.org/R13-1015.pdf)", "evaluation_metadata": {}}, "ncats/EpiSet4NER-v2": {"name": "ncats/EpiSet4NER-v2", "description": "**REWRITE*\nEpiSet4NER-2 is a dataset generated from 620 rare disease abstracts labeled using statistical and rule-base methods. \nFor more details see *INSERT PAPER* and https://github.com/ncats/epi4GARD/tree/master/EpiExtract4GARD#epiextract4gard", "evaluation_metadata": {}}, "strombergnlp/rumoureval_2019": {"name": "strombergnlp/rumoureval_2019", "description": "\nStance prediction task in English. The goal is to predict whether a given reply to a claim either supports, denies, questions, or simply comments on the claim. Ran as a SemEval task in 2019.", "evaluation_metadata": {}}, "HuggingFaceM4/webvid": {"name": "HuggingFaceM4/webvid", "description": "WebVid is a large-scale dataset of video clips with textual descriptions sourced from the web. The videos are diverse and rich in their content.", "evaluation_metadata": {}}, "Leyo/ActivityNet_Captions": {"name": "Leyo/ActivityNet_Captions", "description": "The ActivityNet Captions dataset connects videos to a series of temporally annotated sentence descriptions.\nEach sentence covers an unique segment of the video, describing multiple events that occur. These events\nmay occur over very long or short periods of time and are not limited in any capacity, allowing them to \nco-occur. On average, each of the 20k videos contains 3.65 temporally localized sentences, resulting in\na total of 100k sentences. We find that the number of sentences per video follows a relatively normal\ndistribution. Furthermore, as the video duration increases, the number of sentences also increases. \nEach sentence has an average length of 13.48 words, which is also normally distributed. You can find more\ndetails of the dataset under the ActivityNet Captions Dataset section, and under supplementary materials \nin the paper.", "evaluation_metadata": {}}, "HuggingFaceM4/vatex": {"name": "HuggingFaceM4/vatex", "description": "VATEX is a large-scale multilingual video description dataset, which contains over 41,250 videos and 825,000 captions\nin both English and Chinese. VATEX is characterized by the following major unique properties.\nFirst, it contains both English and Chinese descriptions at scale, which can support many multilingual studies\nthat are constrained by monolingual datasets. Secondly, VATEX has a high number of clip-sentence pairs\nwith each video clip annotated with multiple unique sentences, and every caption is unique in\nthe whole corpus. Third, VATEX contains more comprehensive yet representative video content,\ncovering 600 human activities in total. Furthermore, both the English and Chinese corpora in\nVATEX are lexically richer and thus allow more natural and diverse caption generation.", "evaluation_metadata": {}}, "morteza/cogtext": {"name": "morteza/cogtext", "description": "CogText dataset contains a collection of PubMed abstracts, along with their GPT-3 embeddings and topic embeddings.", "evaluation_metadata": {}}, "nouamanetazi/test111": {"name": "nouamanetazi/test111", "description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.", "evaluation_metadata": {}}, "mteb/amazon_massive_scenario": {"name": "mteb/amazon_massive_scenario", "description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.", "evaluation_metadata": {}}, "mteb/amazon_massive_intent": {"name": "mteb/amazon_massive_intent", "description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.", "evaluation_metadata": {}}, "mwritescode/slither-audited-smart-contracts": {"name": "mwritescode/slither-audited-smart-contracts", "description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.", "evaluation_metadata": {}}, "wdc/products-2017": {"name": "wdc/products-2017", "description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.", "evaluation_metadata": {}}, "EMBO/sd-nlp-non-tokenized": {"name": "EMBO/sd-nlp-non-tokenized", "description": " This dataset is based on the SourceData database and is intented to facilitate training of NLP tasks in the cell and molecualr biology domain.", "evaluation_metadata": {}}, "Iyanuoluwa/YOSM": {"name": "Iyanuoluwa/YOSM", "description": "YOSM: A NEW YORUBA SENTIMENT CORPUS FOR MOVIE REVIEWS\n- Yoruba", "evaluation_metadata": {}}, "strombergnlp/x-stance": {"name": "strombergnlp/x-stance", "description": "The x-stance dataset contains more than 150 political questions, and 67k comments written by candidates on those questions. The comments are partly German, partly French and Italian. The data have been extracted from the Swiss voting advice platform Smartvote.", "evaluation_metadata": {}}, "khalidalt/tydiqa-goldp": {"name": "khalidalt/tydiqa-goldp", "description": "TyDi QA is a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs.\nThe languages of TyDi QA are diverse with regard to their typology -- the set of linguistic features that each language\nexpresses -- such that we expect models performing well on this set to generalize across a large number of the languages\nin the world. It contains language phenomena that would not be found in English-only corpora. To provide a realistic\ninformation-seeking task and avoid priming effects, questions are written by people who want to know the answer, but\ndon\u2019t know the answer yet, (unlike SQuAD and its descendents) and the data is collected directly in each language without\nthe use of translation (unlike MLQA and XQuAD).", "evaluation_metadata": {}}, "WorkInTheDark/FairytaleQA": {"name": "WorkInTheDark/FairytaleQA", "description": "FairytaleQA dataset, an open-source dataset focusing on comprehension of narratives, targeting students from kindergarten to eighth grade. The FairytaleQA dataset is annotated by education experts based on an evidence-based theoretical framework. It consists of 10,580 explicit and implicit questions derived from 278 children-friendly stories, covering seven types of narrative elements or relations.", "evaluation_metadata": {}}, "strombergnlp/nlpcc-stance": {"name": "strombergnlp/nlpcc-stance", "description": "This is a stance prediction dataset in Chinese.\nThe data is that from a shared task, stance detection in Chinese microblogs, in NLPCC-ICCPOL 2016. It covers Task A, a mandatory supervised task which detects stance towards five targets of interest with given labeled data.", "evaluation_metadata": {}}, "HuggingFaceM4/yttemporal180m": {"name": "HuggingFaceM4/yttemporal180m", "description": "YT-Temporal-180M, a large and diverse dataset of 6 million videos (spanning 180M extracted frames)\nthat covers diverse topics.", "evaluation_metadata": {}}, "GEM/FairytaleQA": {"name": "GEM/FairytaleQA", "description": "\\\r\nThe FairytaleQA dataset focusing on narrative comprehension of kindergarten to eighth-grade students. Generated by educational experts based on an evidence-based theoretical framework, FairytaleQA consists of 10,580 explicit and implicit questions derived from 278 children-friendly stories, covering seven types of narrative elements or relations. This is for the Question Generation Task of FairytaleQA.", "evaluation_metadata": {}}, "mteb/bucc-bitext-mining": {"name": "mteb/bucc-bitext-mining", "description": " BUCC 2018 Shared Task test dataset", "evaluation_metadata": {}}, "strombergnlp/ans-stance": {"name": "strombergnlp/ans-stance", "description": "The dataset is a collection of news titles in arabic along with paraphrased and corrupted titles. The stance prediction version is a 3-class classification task. Data contains three columns: s1, s2, stance.", "evaluation_metadata": {}}, "imagenet_sketch": {"name": "imagenet_sketch", "description": "ImageNet-Sketch data set consists of 50000 images, 50 images for each of the 1000 ImageNet classes.\nWe construct the data set with Google Image queries \"sketch of __\", where __ is the standard class name.\nWe only search within the \"black and white\" color scheme. We initially query 100 images for every class,\nand then manually clean the pulled images by deleting the irrelevant images and images that are for similar\nbut different classes. For some classes, there are less than 50 images after manually cleaning, and then we\naugment the data set by flipping and rotating the images.", "evaluation_metadata": {}}, "ccdv/mediasum": {"name": "ccdv/mediasum", "description": "MediaSum dataset for summarization.\n From paper: \"MediaSum: A Large-scale Media Interview Dataset for Dialogue Summarization\" by C. Zhu et al.\"", "evaluation_metadata": {}}, "launch/gov_report": {"name": "launch/gov_report", "description": "GovReport long document summarization dataset.\n\nThere are three configs:\n - plain_text: plain text document-to-summary pairs\n - plain_text_with_recommendations: plain text doucment-summary pairs, with \"What GAO recommends\" included in the summary\n - structure: data with section structure", "evaluation_metadata": {}}, "launch/gov_report_qs": {"name": "launch/gov_report_qs", "description": "GovReport-QS hierarchical question-summary generation dataset.\n\nThere are two configs:\n - paragraph: paragraph-level annotated data\n - document: aggregated paragraph-level annotated data for the same document", "evaluation_metadata": {}}, "ekinakyurek/ftrace": {"name": "ekinakyurek/ftrace", "description": " Factual Tracing Dataset that contains queries and abstracts, and their corresponding ground truth.", "evaluation_metadata": {}}, "strombergnlp/ara-stance": {"name": "strombergnlp/ara-stance", "description": "The AraStance dataset contains true and false claims, where each claim is paired with one or more documents. Each claim\u2013article pair has a stance label: agree, disagree, discuss, or unrelated.", "evaluation_metadata": {}}, "GroNLP/divemt": {"name": "GroNLP/divemt", "description": "DivEMT is the first publicly available post-editing study of Neural Machine Translation (NMT) over a typologically diverse set of target languages. Using a strictly controlled setup, 18 professional translators were instructed to translate or post-edit the same set of English documents into Arabic, Dutch, Italian, Turkish, Ukrainian, and Vietnamese. During the process, their edits, keystrokes, editing times, pauses, and perceived effort were logged, enabling an in-depth, cross-lingual evaluation of NMT quality and its post-editing process.", "evaluation_metadata": {}}, "taln-ls2n/pubmed": {"name": "taln-ls2n/pubmed", "description": "PubMed benchmark dataset for keyphrase extraction and generation.", "evaluation_metadata": {}}, "meetyildiz/toqad": {"name": "meetyildiz/toqad", "description": " Turkish Question Answering Dataset - Base", "evaluation_metadata": {}}, "mteb/amazon_reviews_multi": {"name": "mteb/amazon_reviews_multi", "description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.", "evaluation_metadata": {}}, "Lehrig/GTZAN-Collection": {"name": "Lehrig/GTZAN-Collection", "description": "The dataset consists of 1000 audio tracks each 30 seconds long.\nIt contains 10 genres, each represented by 100 tracks.\nThe tracks are all 22050Hz Mono 16-bit audio files in .wav format.\nThe genres are:\n* blues\n* classical\n* country\n* disco\n* hiphop\n* jazz\n* metal\n* pop\n* reggae\n* rock\n\nThis collection includes the following GTZAN variants:\n* raw (original WAV files)\n* melspectrograms (from each WAV file, contiguous 2-second windows at 4 random locations are sampled and transformed to Mel Spectrograms, resulting in 8000 Mel Spectrograms)", "evaluation_metadata": {}}, "taesiri/GamePhysics_Grand_Theft_Auto_V": {"name": "taesiri/GamePhysics_Grand_Theft_Auto_V", "description": "A test dataset for GamePhysics", "evaluation_metadata": {}}, "mteb/amazon_counterfactual": {"name": "mteb/amazon_counterfactual", "description": "The dataset contains sentences from Amazon customer reviews (sampled from Amazon product review dataset) annotated for counterfactual detection (CFD) binary classification. Counterfactual statements describe events that did not or cannot take place. Counterfactual statements may be identified as statements of the form \u2013 If p was true, then q would be true (i.e. assertions whose antecedent (p) and consequent (q) are known or assumed to be false).", "evaluation_metadata": {}}, "Evelyn18/becasv2": {"name": "Evelyn18/becasv2", "description": "automatic translation of the Stanford Question Answering Dataset (SQuAD) v2 into Spanish", "evaluation_metadata": {}}, "sileod/movie_recommendation": {"name": "sileod/movie_recommendation", "description": "Movie recommendation task based on the Movielens dataset", "evaluation_metadata": {}}, "sileod/discourse_marker_qa": {"name": "sileod/discourse_marker_qa", "description": "Discourse marker/connective prediction as multiple choice questions based on the Discovery dataset", "evaluation_metadata": {}}, "silver/lccc": {"name": "silver/lccc", "description": "LCCC: Large-scale Cleaned Chinese Conversation corpus (LCCC) is a large corpus of Chinese conversations.\nA rigorous data cleaning pipeline is designed to ensure the quality of the corpus.\nThis pipeline involves a set of rules and several classifier-based filters.\nNoises such as offensive or sensitive words, special symbols, emojis,\ngrammatically incorrect sentences, and incoherent conversations are filtered.", "evaluation_metadata": {}}, "silver/mmchat": {"name": "silver/mmchat", "description": "MMChat is a large-scale dialogue dataset that contains image-grounded dialogues in Chinese.\nEach dialogue in MMChat is associated with one or more images (maximum 9 images per dialogue).\nWe design various strategies to ensure the quality of the dialogues in MMChat.", "evaluation_metadata": {}}, "silver/personal_dialog": {"name": "silver/personal_dialog", "description": "The PersonalDialog dataset is a large-scale multi-turn Chinese dialogue dataset containing various traits from a large number of speakers.\nWe are releasing about 5M sessions of carefully filtered dialogues.\nEach utterance in PersonalDialog is associated with a speaker marked with traits like Gender, Location, Interest Tags.", "evaluation_metadata": {}}, "meetyildiz/toqad-aug": {"name": "meetyildiz/toqad-aug", "description": " Turkish Question Answering Dataset - Base", "evaluation_metadata": {}}, "juletxara/xquad_xtreme": {"name": "juletxara/xquad_xtreme", "description": "XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating cross-lingual question answering\nperformance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set\nof SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations into ten languages: Spanish, German,\nGreek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, Hindi and Romanian. Consequently, the dataset is entirely parallel\nacross 12 languages.\nWe also include \"translate-train\", \"translate-dev\", and \"translate-test\" splits for each non-English language from XTREME (Hu et al., 2020). These can be used to run XQuAD in the \"translate-train\" or \"translate-test\" settings.", "evaluation_metadata": {}}, "Lehrig/Monkey-Species-Collection": {"name": "Lehrig/Monkey-Species-Collection", "description": "This dataset is intended as a test case for fine-grain classification tasks (10 different kinds of monkey species). The dataset consists of almost 1400 JPEG images grouped into two splits - training and validation. Each split contains 10 categories labeled as n0~n9, each corresponding a species from [Wikipedia's monkey cladogram](https://en.wikipedia.org/wiki/Monkey). Images were downloaded with help of the [googliser](https://github.com/teracow/googliser) open source code.\n\n\n| Label | Latin Name | Common Name | Train Images | Validation Images |\n| ----- | --------------------- | ------------------------- | ------------ | ----------------- |\n| n0 | alouatta_palliata | mantled_howler | 131 | 26 |\n| n1 | erythrocebus_patas | patas_monkey | 139 | 28 |\n| n2 | cacajao_calvus | bald_uakari | 137 | 27 |\n| n3 | macaca_fuscata | japanese_macaque | 152 | 30 |\n| n4 | cebuella_pygmea | pygmy_marmoset | 131 | 26 |\n| n5 | cebus_capucinus | white_headed_capuchin | 141 | 28 |\n| n6 | mico_argentatus | silvery_marmoset | 132 | 26 |\n| n7 | saimiri_sciureus | common_squirrel_monkey | 142 | 28 |\n| n8 | aotus_nigriceps | black_headed_night_monkey | 133 | 27 |\n| n9 | trachypithecus_johnii | nilgiri_langur | 132 | 26 |\n\n\nThis collection includes the following GTZAN variants:\n* original (images are 400x300 px or larger; ~550 MB)\n* downsized (images are downsized to 224x224 px; ~40 MB)", "evaluation_metadata": {}}, "DFKI-SLT/wikitext_linked": {"name": "DFKI-SLT/wikitext_linked", "description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. Dependency Relations, POS, NER tags are marked with trankit and\n entities are linked with entity-fishing.\n The dataset is available under the Creative Commons Attribution-ShareAlike License.", "evaluation_metadata": {}}, "mteb/sts22-crosslingual-sts": {"name": "mteb/sts22-crosslingual-sts", "description": "SemEval 2022 Task 8: Multilingual News Article Similarity", "evaluation_metadata": {}}, "arize-ai/ecommerce_reviews_with_language_drift": {"name": "arize-ai/ecommerce_reviews_with_language_drift", "description": "This dataset was crafted to be used in our tutorial [Link to the tutorial when\nready]. It consists on product reviews from an e-commerce store. The reviews\nare labeled on a scale from 1 to 5 (stars). The training & validation sets are\nfully composed by reviews written in english. However, the production set has\nsome reviews written in spanish. At Arize, we work to surface this issue and\nhelp you solve it.", "evaluation_metadata": {}}, "enwik8": {"name": "enwik8", "description": "The dataset is based on the Hutter Prize (http://prize.hutter1.net) and contains the first 10^8 bytes of English Wikipedia in 2006 in XML", "evaluation_metadata": {}}, "lmqg/qg_squadshifts": {"name": "lmqg/qg_squadshifts", "description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question generation (QG) task.", "evaluation_metadata": {}}, "lmqg/qg_esquad": {"name": "lmqg/qg_esquad", "description": "[SQuAD-es](https://huggingface.co/datasets/squad_es) dataset for question generation (QG) task.", "evaluation_metadata": {}}, "lmqg/qg_koquad": {"name": "lmqg/qg_koquad", "description": "[KorQuAD](https://huggingface.co/datasets/squad_kor_v1) dataset for question generation (QG) task.", "evaluation_metadata": {}}, "lmqg/qg_ruquad": {"name": "lmqg/qg_ruquad", "description": "[SberSQuAD](https://huggingface.co/datasets/sberquad) dataset for question generation (QG) task.", "evaluation_metadata": {}}, "lmqg/qg_itquad": {"name": "lmqg/qg_itquad", "description": "[SQuAD-it](https://huggingface.co/datasets/squad_it) dataset for question generation (QG) task.", "evaluation_metadata": {}}, "lmqg/qg_dequad": {"name": "lmqg/qg_dequad", "description": "[GermanSQuAD](https://huggingface.co/datasets/deepset/germanquad) dataset for question generation (QG) task.", "evaluation_metadata": {}}, "anton-l/earnings22": {"name": "anton-l/earnings22", "description": "The Earnings 22 dataset ( also referred to as earnings22 ) is a 119-hour corpus of English-language earnings calls collected from global companies. \nThe primary purpose is to serve as a benchmark for industrial and academic automatic speech recognition (ASR) models on real-world accented speech.", "evaluation_metadata": {}}, "BlackSamorez/2ch_b_dialogues": {"name": "BlackSamorez/2ch_b_dialogues", "description": "Dialogues build from 2ch.hk/b/ threads", "evaluation_metadata": {}}, "carblacac/twitter-sentiment-analysis": {"name": "carblacac/twitter-sentiment-analysis", "description": "The Twitter Sentiment Analysis Dataset contains 1,578,627 classified tweets, each row is marked as 1 for positive sentiment and 0 for negative sentiment.\nThe dataset is based on data from the following two sources:\n\nUniversity of Michigan Sentiment Analysis competition on Kaggle\nTwitter Sentiment Corpus by Niek Sanders\n\nFinally, I randomly selected a subset of them, applied a cleaning process, and divided them between the test and train subsets, keeping a balance between\nthe number of positive and negative tweets within each of these subsets.", "evaluation_metadata": {}}, "JeremyAlain/123_test": {"name": "JeremyAlain/123_test", "description": "The Fewshot Table dataset consists of tables that naturally occur on the web, that are formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. The dataset consists of approximately 413K tables that are extracted from the WDC Web Table Corpora 2015, which is released under the Apache-2.0 license. The WDC Web Table Corpora \"contains vast amounts of HTML tables. [...] The Web Data Commons project extracts relational Web tables from the Common Crawl, the largest and most up-to-date Web corpus that is currently available to the public.\"", "evaluation_metadata": {}}, "nlpaueb/multi_eurlex": {"name": "nlpaueb/multi_eurlex", "description": "An non-parallel version of the MultiEURLEX datasets released by Chalkidis et al. (2021). \nMultiEURLEX comprises 65k EU laws in 23 official EU languages (some low-ish resource).\nEach EU law has been annotated with EUROVOC concepts (labels) by the Publication Office of EU.\nAs with the English EURLEX, the goal is to predict the relevant EUROVOC concepts (labels);\nthis is multi-label classification task (given the text, predict multiple labels).\nIn this version, MultiEURLEX comprises non-parallel documents across 5 languages (English, German, French, Greek, \nand Slovakian) including translations from English to the rest of the 4 available languages.", "evaluation_metadata": {}}, "linxinyuan/imdb": {"name": "linxinyuan/imdb", "description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.\\", "evaluation_metadata": {}}, "juletxara/tydiqa_xtreme": {"name": "juletxara/tydiqa_xtreme", "description": "TyDi QA is a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs.\nThe languages of TyDi QA are diverse with regard to their typology -- the set of linguistic features that each language\nexpresses -- such that we expect models performing well on this set to generalize across a large number of the languages\nin the world. It contains language phenomena that would not be found in English-only corpora. To provide a realistic\ninformation-seeking task and avoid priming effects, questions are written by people who want to know the answer, but\ndon\u2019t know the answer yet, (unlike SQuAD and its descendents) and the data is collected directly in each language without\nthe use of translation (unlike MLQA and XQuAD).\n\nWe also include \"translate-train\" and \"translate-test\" splits for each non-English languages from XTREME (Hu et al., 2020). These splits are the automatic translations from English to each target language used in the XTREME paper [https://arxiv.org/abs/2003.11080]. The \"translate-train\" split purposefully ignores the non-English TyDiQA-GoldP training data to simulate the transfer learning scenario where original-language data is not available and system builders must rely on labeled English data plus existing machine translation systems.", "evaluation_metadata": {}}, "truthful_qa": {"name": "truthful_qa", "description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.", "evaluation_metadata": {}}, "quickdraw": {"name": "quickdraw", "description": "The Quick Draw Dataset is a collection of 50 million drawings across 345 categories, contributed by players of the game Quick, Draw!.\nThe drawings were captured as timestamped vectors, tagged with metadata including what the player was asked to draw and in which country the player was located.", "evaluation_metadata": {}}, "sil-ai/bloom-speech": {"name": "sil-ai/bloom-speech", "description": "Bloom-speech is a dataset of text aligned speech from bloomlibrary.org. This dataset contains over 50 languages including many low-resource languages. This dataset should be useful for training and/or testing speech-to-text or text-to-speech/ASR models.", "evaluation_metadata": {}}, "speechcolab/gigaspeech": {"name": "speechcolab/gigaspeech", "description": "GigaSpeech is an evolving, multi-domain English speech recognition corpus with 10,000 hours of high quality\nlabeled audio suitable for supervised training, and 40,000 hours of total audio suitable for semi-supervised\nand unsupervised training. Around 40,000 hours of transcribed audio is first collected from audiobooks, podcasts\nand YouTube, covering both read and spontaneous speaking styles, and a variety of topics, such as arts, science,\nsports, etc. A new forced alignment and segmentation pipeline is proposed to create sentence segments suitable\nfor speech recognition training, and to filter out segments with low-quality transcription. For system training,\nGigaSpeech provides five subsets of different sizes, 10h, 250h, 1000h, 2500h, and 10000h.\nFor our 10,000-hour XL training subset, we cap the word error rate at 4% during the filtering/validation stage,\nand for all our other smaller training subsets, we cap it at 0%. The DEV and TEST evaluation sets, on the other hand,\nare re-processed by professional human transcribers to ensure high transcription quality.", "evaluation_metadata": {}}, "Theivaprakasham/wildreceipt": {"name": "Theivaprakasham/wildreceipt", "description": "WildReceipt is a collection of receipts. It contains, for each photo, a list of OCRs - with the bounding box, text, and class. It contains 1765 photos, with 25 classes, and 50000 text boxes. The goal is to benchmark \"key information extraction\" - extracting key information from documents\nhttps://arxiv.org/abs/2103.14470", "evaluation_metadata": {}}, "sagot/lefff_morpho": {"name": "sagot/lefff_morpho", "description": "The lefff-morpho dataset gives access to the morphological information, in both its original format and the UniMorph format.", "evaluation_metadata": {}}, "amueller/syntactic_transformations": {"name": "amueller/syntactic_transformations", "description": "This is the dataset used for Coloring the Blank Slate: \nPre-training Imparts a Hierarchical Inductive Bias to\nSequence-to-sequence Models.", "evaluation_metadata": {}}, "Rodekool/ornl8": {"name": "Rodekool/ornl8", "description": "still a WIP, Dataset originally comes from Open Data van de Rechtspraak\"", "evaluation_metadata": {}}, "sst2": {"name": "sst2", "description": "The Stanford Sentiment Treebank consists of sentences from movie reviews and\nhuman annotations of their sentiment. The task is to predict the sentiment of a\ngiven sentence. We use the two-way (positive/negative) class split, and use only\nsentence-level labels.", "evaluation_metadata": {}}, "gsarti/magpie": {"name": "gsarti/magpie", "description": "The MAGPIE corpus is a large sense-annotated corpus of potentially idiomatic expressions (PIEs), based on the British National Corpus (BNC). Potentially idiomatic expressions are like idiomatic expressions, but the term also covers literal uses of idiomatic expressions, such as 'I leave work at the end of the day.' for the idiom 'at the end of the day'. This version of the dataset reflects the filtered subset used by Dankers et al. (2022) in their investigation on how PIEs are represented by NMT models. Authors use 37k samples annotated as fully figurative or literal, for 1482 idioms that contain nouns, numerals or adjectives that are colours (which they refer to as keywords). Because idioms show syntactic and morphological variability, the focus is mostly put on nouns. PIEs and their context are separated using the original corpus\u2019s word-level annotations.", "evaluation_metadata": {}}, "PiC/phrase_retrieval": {"name": "PiC/phrase_retrieval", "description": "Phrase in Context is a curated benchmark for phrase understanding and semantic search, consisting of three tasks of increasing difficulty: Phrase Similarity (PS), Phrase Retrieval (PR) and Phrase Sense Disambiguation (PSD). The datasets are annotated by 13 linguistic experts on Upwork and verified by two groups: ~1000 AMT crowdworkers and another set of 5 linguistic experts. PiC benchmark is distributed under CC-BY-NC 4.0.", "evaluation_metadata": {}}, "PiC/phrase_sense_disambiguation": {"name": "PiC/phrase_sense_disambiguation", "description": "Phrase in Context is a curated benchmark for phrase understanding and semantic search, consisting of three tasks of increasing difficulty: Phrase Similarity (PS), Phrase Retrieval (PR) and Phrase Sense Disambiguation (PSD). The datasets are annotated by 13 linguistic experts on Upwork and verified by two groups: ~1000 AMT crowdworkers and another set of 5 linguistic experts. PiC benchmark is distributed under CC-BY-NC 4.0.", "evaluation_metadata": {}}, "PiC/phrase_similarity": {"name": "PiC/phrase_similarity", "description": "Phrase in Context is a curated benchmark for phrase understanding and semantic search, consisting of three tasks of increasing difficulty: Phrase Similarity (PS), Phrase Retrieval (PR) and Phrase Sense Disambiguation (PSD). The datasets are annotated by 13 linguistic experts on Upwork and verified by two groups: ~1000 AMT crowdworkers and another set of 5 linguistic experts. PiC benchmark is distributed under CC-BY-NC 4.0.", "evaluation_metadata": {}}, "taskydata/tasky_or_not": {"name": "taskydata/tasky_or_not", "description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.", "evaluation_metadata": {}}, "lccc": {"name": "lccc", "description": "LCCC: Large-scale Cleaned Chinese Conversation corpus (LCCC) is a large corpus of Chinese conversations.\nA rigorous data cleaning pipeline is designed to ensure the quality of the corpus.\nThis pipeline involves a set of rules and several classifier-based filters.\nNoises such as offensive or sensitive words, special symbols, emojis,\ngrammatically incorrect sentences, and incoherent conversations are filtered.", "evaluation_metadata": {}}, "gcaillaut/frwiki_el": {"name": "gcaillaut/frwiki_el", "description": "French Wikipedia dataset for Entity Linking", "evaluation_metadata": {}}, "codeparrot/apps": {"name": "codeparrot/apps", "description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.", "evaluation_metadata": {}}, "fmplaza/offendes": {"name": "fmplaza/offendes", "description": "Focusing on young influencers from the well-known social platforms of Twitter, Instagram, and YouTube, \nwe have collected the corpus OffendES which is composed of Spanish comments manually labeled on offensive pre-defined categories. From the total corpus, we selected 30,416 \nposts to be publicly published, they correspond to the ones used in the MeOffendES competition at IberLEF 2021.", "evaluation_metadata": {}}, "vesteinn/sosialurin-faroese-pos": {"name": "vesteinn/sosialurin-faroese-pos", "description": "The corpus that has been created consists of ca. 100.000 words of text from the [Faroese] newspaper Sosialurin. Each word is tagged with grammatical information (word class, gender, number etc.)", "evaluation_metadata": {}}, "khalidalt/tydiqa-primary": {"name": "khalidalt/tydiqa-primary", "description": "TyDi QA is a question answering dataset covering 11 typologically diverse languages with 204K question-answer pairs.\nThe languages of TyDi QA are diverse with regard to their typology -- the set of linguistic features that each language\nexpresses -- such that we expect models performing well on this set to generalize across a large number of the languages\nin the world. It contains language phenomena that would not be found in English-only corpora. To provide a realistic\ninformation-seeking task and avoid priming effects, questions are written by people who want to know the answer, but\ndon\u2019t know the answer yet, (unlike SQuAD and its descendents) and the data is collected directly in each language without\nthe use of translation (unlike MLQA and XQuAD).", "evaluation_metadata": {}}, "vadis/sv-ident": {"name": "vadis/sv-ident", "description": "The SV-Ident corpus (version 0.3) is a collection of 4,248 expert-annotated English\nand German sentences from social science publications, supporting the task of\nmulti-label text classification.", "evaluation_metadata": {}}, "facebook/pmd": {"name": "facebook/pmd", "description": "Introduced in FLAVA paper, Public Multimodal Dataset (PMD) is a collection of publicly-available image-text pairs datasets. PMD in total contains 70M image-text pairs with 68M unique images. The dataset contains pairs from Conceptual Captions, Conceptual Captions 12M, WIT, Localized Narratives, RedCaps, COCO, SBU Captions, Visual Genome and a subset of YFCC100M dataset.", "evaluation_metadata": {}}, "hugginglearners/malayalam_news": {"name": "hugginglearners/malayalam_news", "description": "The AI4Bharat-IndicNLP dataset is an ongoing effort to create a collection of large-scale, \ngeneral-domain corpora for Indian languages. Currently, it contains 2.7 billion words for 10 Indian languages from two language families. \nWe share pre-trained word embeddings trained on these corpora.\nWe create news article category classification datasets for 9 languages to evaluate the embeddings.\nWe evaluate the IndicNLP embeddings on multiple evaluation tasks.", "evaluation_metadata": {}}, "anton-l/earnings22_robust": {"name": "anton-l/earnings22_robust", "description": "\\nThe Earnings 22 dataset ( also referred to as earnings22 ) is a 119-hour corpus of English-language earnings calls collected from global companies. \nThe primary purpose is to serve as a benchmark for industrial and academic automatic speech recognition (ASR) models on real-world accented speech.", "evaluation_metadata": {}}, "huggingartists/headie-one": {"name": "huggingartists/headie-one", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "Shayanvsf/pquad_public": {"name": "Shayanvsf/pquad_public", "description": "\\\\\\ParSQuAD: Persian Question Answering Dataset based on Machine Translation of SQuAD 2.0", "evaluation_metadata": {}}, "JulesBelveze/tldr_news": {"name": "JulesBelveze/tldr_news", "description": "The `tldr_news` dataset was constructed by collecting a daily tech newsletter (available at \nhttps://tldr.tech/newsletter). Then for every piece of news, the \"headline\" and its corresponding \"content\" were \ncollected. Such a dataset can be used to train a model to generate a headline from a input piece of text.", "evaluation_metadata": {}}, "jalFaizy/detect_chess_pieces": {"name": "jalFaizy/detect_chess_pieces", "description": "The \"Object Detection for Chess Pieces\" dataset is a toy dataset created (as suggested by the name!) to introduce object detection in a beginner friendly way.", "evaluation_metadata": {}}, "HuggingFaceM4/VQAv2": {"name": "HuggingFaceM4/VQAv2", "description": "VQA is a new dataset containing open-ended questions about images. These questions require an understanding of vision, language and commonsense knowledge to answer.", "evaluation_metadata": {}}, "flexthink/librig2p-nostress-space-cmu": {"name": "flexthink/librig2p-nostress-space-cmu", "description": "Grapheme-to-Phoneme training, validation and test sets", "evaluation_metadata": {}}, "projecte-aina/ca_zh_wikipedia": {"name": "projecte-aina/ca_zh_wikipedia", "description": "The CA-ZH Parallel Corpus is a Catalan-Chinese dataset of mutual translations automatically crawled from Wikipedia. Two separate corpora are included, namely CA-ZH 1.05 Wikipedia and CA-ZH 1.10 Wikipedia, the latter has better general quality than the former. The dataset was created to support Catalan NLP tasks, e.g. Machine Translation.", "evaluation_metadata": {}}, "launch/open_question_type": {"name": "launch/open_question_type", "description": "Open-ended question type annotated dataset.", "evaluation_metadata": {}}, "ctu-aic/enfever_nli": {"name": "ctu-aic/enfever_nli", "description": "EnfeverNLI is a NLI version of the fever dataset", "evaluation_metadata": {}}, "projecte-aina/catalanqa": {"name": "projecte-aina/catalanqa", "description": "CatalanQA: an extractive QA dataset from original Catalan Sources: Wikipedia and VilaWeb newswire.\n\n It is an aggregation and balancing of 2 previous datasets: VilaQUAD and ViquiQUAD, which were described in \n\nThis dataset can be used to build extractive-QA and Language Models.\n\nSplts have been balanced by kind of question, and unlike other datasets like SQUAD, it only contains, per record, one question and one answer for each context, although the contexts can repeat multiple times.\n\n- test.json \tcontains 2135 question/answer pairs\n\n- train.json \tcontains\t 17135 question/answer pairs\n\n- dev.json contains 2157 question/answer pairs\n\nFunded by the Generalitat de Catalunya, Departament de Pol\u00edtiques Digitals i Administraci\u00f3 P\u00fablica (AINA),\n and Plan de Impulso de las Tecnolog\u00edas del Lenguaje (Plan TL).", "evaluation_metadata": {}}, "kensho/spgispeech": {"name": "kensho/spgispeech", "description": "The SPGISpeech corpus is derived from company earnings calls manually transcribed by S&P Global, Inc. according to a pro- fessional style guide detailing conventions for capitalization, punctuation, denormalization of non-standard words and tran- scription of disfluencies in spontaneous speech. The basic unit of SPGISpeech is a pair consisting of a 5 to 15 second long 16 bit, 16kHz mono wav audio file and its transcription..", "evaluation_metadata": {}}, "codeparrot/github-code-clean": {"name": "codeparrot/github-code-clean", "description": "The GitHub Code clean dataset in a more filtered version of codeparrot/github-code dataset, it consists of 115M code files from GitHub in 32 programming languages with 60 extensions totaling in almost 1TB of text data.", "evaluation_metadata": {}}, "sunlixu/Uyghur": {"name": "sunlixu/Uyghur", "description": "This new dataset is from Xinjiang University and to do some ASR in low resource.", "evaluation_metadata": {}}, "PolyAI/evi": {"name": "PolyAI/evi", "description": "EVI is a challenging spoken multilingual dataset with 5,506 dialogues in English, Polish, and French \nthat can be used for benchmarking and developing knowledge-based enrolment, identification, and identification \nfor spoken dialogue systems.", "evaluation_metadata": {}}, "arize-ai/xtreme_en": {"name": "arize-ai/xtreme_en", "description": "This dataset was crafted to be used in our tutorial [Link to the tutorial when\nready]. It consists on product reviews from an e-commerce store. The reviews\nare labeled on a scale from 1 to 5 (stars). The training & validation sets are\nfully composed by reviews written in english. However, the production set has\nsome reviews written in spanish. At Arize, we work to surface this issue and\nhelp you solve it.", "evaluation_metadata": {}}, "arize-ai/xtreme_en_language_drift_es": {"name": "arize-ai/xtreme_en_language_drift_es", "description": "This dataset was crafted to be used in our tutorial [Link to the tutorial when\nready]. It consists on product reviews from an e-commerce store. The reviews\nare labeled on a scale from 1 to 5 (stars). The training & validation sets are\nfully composed by reviews written in english. However, the production set has\nsome reviews written in spanish. At Arize, we work to surface this issue and\nhelp you solve it.", "evaluation_metadata": {}}, "arize-ai/xtreme_en_token_drift": {"name": "arize-ai/xtreme_en_token_drift", "description": "This dataset was crafted to be used in our tutorial [Link to the tutorial when\nready]. It consists on product reviews from an e-commerce store. The reviews\nare labeled on a scale from 1 to 5 (stars). The training & validation sets are\nfully composed by reviews written in english. However, the production set has\nsome reviews written in spanish. At Arize, we work to surface this issue and\nhelp you solve it.", "evaluation_metadata": {}}, "benschill/brain-tumor-collection": {"name": "benschill/brain-tumor-collection", "description": "This dataset is intended as a test case for classification tasks (4 different kinds of brain xrays). The dataset consists of almost 1400 JPEG images grouped into two splits - training and validation. Each split contains 4 categories labeled as n0~n3, each corresponding to a cancer result of the mrt.\n\n\n| Label | Xray Category | Train Images | Validation Images |\n| ----- | --------------------- | ------------ | ----------------- |\n| n0 | glioma_tumor | 826 | 100 |\n| n1 | meningioma_tumor | 822 | 115 |\n| n2 | pituitary_tumor | 827 | 74 |\n| n3 | no_tumor | 395 | 105 |", "evaluation_metadata": {}}, "MicPie/unpredictable_full": {"name": "MicPie/unpredictable_full", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "bigbio/scitail": {"name": "bigbio/scitail", "description": "The SciTail dataset is an entailment dataset created from multiple-choice science exams and\nweb sentences. Each question and the correct answer choice are converted into an assertive\nstatement to form the hypothesis. We use information retrieval to obtain relevant text from\na large text corpus of web sentences, and use these sentences as a premise P. We crowdsource\nthe annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order\nto create the SciTail dataset. The dataset contains 27,026 examples with 10,101 examples with\nentails label and 16,925 examples with neutral label.", "evaluation_metadata": {}}, "MicPie/unpredictable_mmo-champion-com": {"name": "MicPie/unpredictable_mmo-champion-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_baseball-fantasysports-yahoo-com": {"name": "MicPie/unpredictable_baseball-fantasysports-yahoo-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_phonearena-com": {"name": "MicPie/unpredictable_phonearena-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_support-google-com": {"name": "MicPie/unpredictable_support-google-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_dividend-com": {"name": "MicPie/unpredictable_dividend-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_bulbapedia-bulbagarden-net": {"name": "MicPie/unpredictable_bulbapedia-bulbagarden-net", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_wkdu-org": {"name": "MicPie/unpredictable_wkdu-org", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_dummies-com": {"name": "MicPie/unpredictable_dummies-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_mgoblog-com": {"name": "MicPie/unpredictable_mgoblog-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_gamefaqs-com": {"name": "MicPie/unpredictable_gamefaqs-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_studystack-com": {"name": "MicPie/unpredictable_studystack-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_sittercity-com": {"name": "MicPie/unpredictable_sittercity-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_msdn-microsoft-com": {"name": "MicPie/unpredictable_msdn-microsoft-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cappex-com": {"name": "MicPie/unpredictable_cappex-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_en-wikipedia-org": {"name": "MicPie/unpredictable_en-wikipedia-org", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cram-com": {"name": "MicPie/unpredictable_cram-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_w3-org": {"name": "MicPie/unpredictable_w3-org", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_sporcle-com": {"name": "MicPie/unpredictable_sporcle-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_wiki-openmoko-org": {"name": "MicPie/unpredictable_wiki-openmoko-org", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_ensembl-org": {"name": "MicPie/unpredictable_ensembl-org", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "BDas/Turkish-Dataset": {"name": "BDas/Turkish-Dataset", "description": "The dataset, prepared in Turkish, includes 53.000 tests, 53.000 validations and 160600 train data.\nThe data is composed of customer comments and created from e-commerce sites.", "evaluation_metadata": {}}, "andreagasparini/librispeech_test_only": {"name": "andreagasparini/librispeech_test_only", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "MicPie/unpredictable_5k": {"name": "MicPie/unpredictable_5k", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "biglam/cultural_heritage_metadata_accuracy": {"name": "biglam/cultural_heritage_metadata_accuracy", "description": "The dataset contains more than 100K textual descriptions of cultural items from Cultura Italia (http://www.culturaitalia.it/opencms/index.jsp?language=en), the Italian National Cultural aggregator. \nEach of the description is labeled either HIGH or LOW quality, according its adherence to the standard cataloguing guidelines provided by Istituto Centrale per il Catalogo e la Documentazione (ICCD).", "evaluation_metadata": {}}, "SocialGrep/one-year-of-tsla-on-reddit": {"name": "SocialGrep/one-year-of-tsla-on-reddit", "description": "This dataset contains all the posts and comments mentioning the term \"TSLA\", spanning from July 5th, 2021 to July 4th, 2022.", "evaluation_metadata": {}}, "MicPie/unpredictable_unique": {"name": "MicPie/unpredictable_unique", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster-noise": {"name": "MicPie/unpredictable_cluster-noise", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster00": {"name": "MicPie/unpredictable_cluster00", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster01": {"name": "MicPie/unpredictable_cluster01", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster10": {"name": "MicPie/unpredictable_cluster10", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster11": {"name": "MicPie/unpredictable_cluster11", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster12": {"name": "MicPie/unpredictable_cluster12", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster13": {"name": "MicPie/unpredictable_cluster13", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster14": {"name": "MicPie/unpredictable_cluster14", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster15": {"name": "MicPie/unpredictable_cluster15", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster16": {"name": "MicPie/unpredictable_cluster16", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster17": {"name": "MicPie/unpredictable_cluster17", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster18": {"name": "MicPie/unpredictable_cluster18", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster19": {"name": "MicPie/unpredictable_cluster19", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster02": {"name": "MicPie/unpredictable_cluster02", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster20": {"name": "MicPie/unpredictable_cluster20", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster21": {"name": "MicPie/unpredictable_cluster21", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster22": {"name": "MicPie/unpredictable_cluster22", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster23": {"name": "MicPie/unpredictable_cluster23", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster24": {"name": "MicPie/unpredictable_cluster24", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster25": {"name": "MicPie/unpredictable_cluster25", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster26": {"name": "MicPie/unpredictable_cluster26", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster27": {"name": "MicPie/unpredictable_cluster27", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster28": {"name": "MicPie/unpredictable_cluster28", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster29": {"name": "MicPie/unpredictable_cluster29", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster03": {"name": "MicPie/unpredictable_cluster03", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster04": {"name": "MicPie/unpredictable_cluster04", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster05": {"name": "MicPie/unpredictable_cluster05", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster06": {"name": "MicPie/unpredictable_cluster06", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster07": {"name": "MicPie/unpredictable_cluster07", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster08": {"name": "MicPie/unpredictable_cluster08", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_cluster09": {"name": "MicPie/unpredictable_cluster09", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "changxin/test_pq": {"name": "changxin/test_pq", "description": "This is a test dataset.", "evaluation_metadata": {}}, "MicPie/unpredictable_rated-low": {"name": "MicPie/unpredictable_rated-low", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_rated-medium": {"name": "MicPie/unpredictable_rated-medium", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "MicPie/unpredictable_rated-high": {"name": "MicPie/unpredictable_rated-high", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "Heriot-WattUniversity/dialog_babi": {"name": "Heriot-WattUniversity/dialog_babi", "description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.", "evaluation_metadata": {}}, "Sreyan88/librispeech_asr": {"name": "Sreyan88/librispeech_asr", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "chenz16/curriculum_benchmark": {"name": "chenz16/curriculum_benchmark", "description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.", "evaluation_metadata": {}}, "biglam/brill_iconclass": {"name": "biglam/brill_iconclass", "description": "A dataset for applying machine learning to collections described with the Iconclass classification system.", "evaluation_metadata": {}}, "biglam/atypical_animacy": {"name": "biglam/atypical_animacy", "description": "Atypical animacy detection dataset, based on nineteenth-century sentences in English extracted from an open dataset of nineteenth-century books digitized by the British Library (available via https://doi.org/10.21250/db14, British Library Labs, 2014). \nThis dataset contains 598 sentences containing mentions of machines. Each sentence has been annotated according to the animacy and humanness of the machine in the sentence.", "evaluation_metadata": {}}, "huggingartists/ciggy-blacc": {"name": "huggingartists/ciggy-blacc", "description": "This dataset is designed to generate lyrics with HuggingArtists.", "evaluation_metadata": {}}, "04-07-22/wep-probes": {"name": "04-07-22/wep-probes", "description": "Probing neural language models for understanding of words of estimative probability\nAnonymous submission", "evaluation_metadata": {}}, "codeparrot/xlcost-text-to-code": {"name": "codeparrot/xlcost-text-to-code", "description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "evaluation_metadata": {}}, "demelin/moral_stories": {"name": "demelin/moral_stories", "description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.", "evaluation_metadata": {}}, "demelin/wino_x": {"name": "demelin/wino_x", "description": "Wino-X is a parallel dataset of German, French, and Russian Winograd schemas, aligned with their English \ncounterparts, used to examine whether neural machine translation models can perform coreference resolution that \nrequires commonsense knowledge and whether multilingual language models are capable of commonsense reasoning across \nmultiple languages.", "evaluation_metadata": {}}, "demelin/understanding_fables": {"name": "demelin/understanding_fables", "description": "This task aims to measure the ability of computational models to understand short narratives, by identifying the most \nappropriate moral for a given fable from a set of five alternatives.", "evaluation_metadata": {}}, "ArthurBaia/squad_v1_pt_br": {"name": "ArthurBaia/squad_v1_pt_br", "description": "This dataset was translated by Deep Learning Brazil", "evaluation_metadata": {}}, "colbertv2/lotte": {"name": "colbertv2/lotte", "description": "LoTTE Passages Dataset for ColBERTv2", "evaluation_metadata": {}}, "colbertv2/lotte_passages": {"name": "colbertv2/lotte_passages", "description": "LoTTE Passages Dataset for ColBERTv2", "evaluation_metadata": {}}, "Rodekool/ornl26": {"name": "Rodekool/ornl26", "description": "still a WIP, Dataset originally comes from Open Data van de Rechtspraak\"", "evaluation_metadata": {}}, "nbroad/mediasum": {"name": "nbroad/mediasum", "description": "This large-scale media interview dataset contains 463.6K transcripts with abstractive summaries, \ncollected from interview transcripts and overview / topic descriptions from NPR and CNN.", "evaluation_metadata": {}}, "tner/bc5cdr": {"name": "tner/bc5cdr", "description": "[Bio Creative 5 CDR NER dataset](https://academic.oup.com/database/article/doi/10.1093/database/baw032/2630271?login=true)", "evaluation_metadata": {}}, "pnr-svc/Turkish-Multiclass-Dataset": {"name": "pnr-svc/Turkish-Multiclass-Dataset", "description": "The dataset, prepared in Turkish, includes 10.000 tests, 10.000 validations and 33000 train data.\nThe data is composed of customer comments and created from e-commerce sites.", "evaluation_metadata": [{"config": "TurkishMulticlassDataset", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}}]}, "biglam/old_bailey_proceedings": {"name": "biglam/old_bailey_proceedings", "description": "The dataset consists of 2,163 transcriptions of the Proceedings and 475 Ordinary's Accounts marked up in TEI-XML, \nand contains some documentation covering the data structure and variables. Each Proceedings file represents one session of the court (1674-1913), \nand each Ordinary's Account file represents a single pamphlet (1676-1772)", "evaluation_metadata": {}}, "Muennighoff/xwinograd": {"name": "Muennighoff/xwinograd", "description": "A multilingual collection of Winograd Schemas in six languages that can be used for evaluation of cross-lingual commonsense reasoning capabilities.", "evaluation_metadata": {}}, "pyronear/openfire": {"name": "pyronear/openfire", "description": "OpenFire is an image classification dataset for wildfire detection, collected\nfrom web searches.", "evaluation_metadata": {}}, "biglam/clmet_3_1": {"name": "biglam/clmet_3_1", "description": "The Corpus of Late Modern English Texts, version 3.1 (CLMET3.1) has been created by Hendrik De Smet, \nSusanne Flach, Hans-J\u00fcrgen Diller and Jukka Tyrkk\u00f6, as an offshoot of a bigger project developing a database of text \ndescriptors (Diller, De Smet & Tyrkk\u00f6 2011). CLMET3.1 is a principled collection of public domain texts drawn from \nvarious online archiving projects. This dataset can be used for part-of-speech tagging, NER and text classification", "evaluation_metadata": {}}, "frgfm/imagenette": {"name": "frgfm/imagenette", "description": "Imagenette is a subset of 10 easily classified classes from Imagenet\n(tench, English springer, cassette player, chain saw, church, French\nhorn, garbage truck, gas pump, golf ball, parachute).", "evaluation_metadata": {}}, "allenai/mslr2022": {"name": "allenai/mslr2022", "description": "The Multidocument Summarization for Literature Review (MSLR) Shared Task aims to study how medical\nevidence from different clinical studies are summarized in literature reviews. Reviews provide the\nhighest quality of evidence for clinical care, but are expensive to produce manually.\n(Semi-)automation via NLP may facilitate faster evidence synthesis without sacrificing rigor.\nThe MSLR shared task uses two datasets to assess the current state of multidocument summarization\nfor this task, and to encourage the development of modeling contributions, scaffolding tasks, methods\nfor model interpretability, and improved automated evaluation methods in this domain.", "evaluation_metadata": {}}, "Muennighoff/mbpp": {"name": "Muennighoff/mbpp", "description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases.", "evaluation_metadata": {}}, "RaphaelOlivier/librispeech_asr_adversarial": {"name": "RaphaelOlivier/librispeech_asr_adversarial", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n```python\nimport soundfile as sf\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```", "evaluation_metadata": {}}, "biglam/lampeter_corpus": {"name": "biglam/lampeter_corpus", "description": "The Lampeter Corpus of Early Modern English Tracts is a collection of texts on\n various subject matter published between 1640 and 1740 \u2013 a time that is marked by the rise of mass \n publication, the development of a public discourse in many areas of everyday life \n and, last but not least, the standardisation of British English.", "evaluation_metadata": {}}, "breakend/nllb-multi-domain": {"name": "breakend/nllb-multi-domain", "description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.", "evaluation_metadata": {}}, "biglam/lancaster_newsbooks": {"name": "biglam/lancaster_newsbooks", "description": "This corpus consists of two collections of seventeenth-century English \"newsbooks\". Both were drawn from the Thomason Tracts collection, which is held at the British Library and available in graphical form via Early English Books Online (EEBO). The construction of these keyboarded versions were in both cases funded by the British Academy.\nThe FIRST collection (1654_newsbooks) consists of every newsbook published in London and still surviving in the Thomason Tracts from the first half of 1654 (to be precise, for the second half of December 1653 to the end of May 1654, with one or two additions from the first week in June, 1654). This was constructed for the project \"Looking at text re-use in a corpus of seventeenth-century news reportage\", funded by the British Academy, grant reference SG-33825. \nThe SECOND collection (mercurius_fumigosus) consists of every surviving issue published of the highly idiosyncratic newsbook \"Mercurius Fumigosus\", written by John Crouch between summer 1654 and early autumn 1655. This was constructed for the project \"Decoding the news - Mercurius Fumigosus as a source of news in the interregnum, 1654-1655\", funded by the British Academy, grant reference LRG-35423. \nThis is version 1.0 of the corpus, released April 2007; it supercedes earlier versions circulated informally.\nFor more information about the corpus, see www.ling.lancs.ac.uk/newsbooks", "evaluation_metadata": {}}, "Gpaiva/NERDE": {"name": "Gpaiva/NERDE", "description": "(pt) NERDE \u00e9 um dataset para NER a partir de documentos jur\u00eddicos da defesa econ\u00f4mica em portugu\u00eas do Brasil, foi criado em colabora\u00e7\u00e3o com o Cade e o laborat\u00f3rio LATITUDE/UnB.\n(en) NERDE is a NER dataset from economic defense legal documents in Brazilian Portuguese, created in collaboration with Cade and the LATITUDE/UnB laboratory.", "evaluation_metadata": {}}, "SocialGrep/reddit-r-bitcoin-data-for-jun-2022": {"name": "SocialGrep/reddit-r-bitcoin-data-for-jun-2022", "description": "Lite version of our Reddit /r/Bitcoin dataset - CSV of all posts & comments to the /r/Bitcoin subreddit over Jun 2022.", "evaluation_metadata": {}}, "Kamrani/en-fa-translation": {"name": "Kamrani/en-fa-translation", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "muibk/wmt19_metrics_task": {"name": "muibk/wmt19_metrics_task", "description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.", "evaluation_metadata": {}}, "frgfm/imagewoof": {"name": "frgfm/imagewoof", "description": "Imagewoof is a subset of 10 classes from Imagenet that aren't so \neasy to classify, since they're all dog breeds. The breeds are: \nAustralian terrier, Border terrier, Samoyed, Beagle, Shih-Tzu, \nEnglish foxhound, Rhodesian ridgeback, Dingo, Golden retriever, \nOld English sheepdog.", "evaluation_metadata": {}}, "tarteel-ai/quranqa": {"name": "tarteel-ai/quranqa", "description": "The absence of publicly available reusable test collections for Arabic question answering on the Holy Qur\u2019an has impeded the possibility of fairly comparing the performance of systems in that domain. In this article, we introduce AyaTEC, a reusable test collection for verse-based question answering on the Holy Qur\u2019an, which serves as a common experimental testbed for this task. AyaTEC includes 207 questions (with their corresponding 1,762 answers) covering 11 topic categories of the Holy Qur\u2019an that target the information needs of both curious and skeptical users. To the best of our effort, the answers to the questions (each represented as a sequence of verses) in AyaTEC were exhaustive\u2014that is, all qur\u2019anic verses that directly answered the questions were exhaustively extracted and annotated. To facilitate the use of AyaTEC in evaluating the systems designed for that task, we propose several evaluation measures to support the different types of questions and the nature of verse-based answers while integrating the concept of partial matching of answers in the evaluation.", "evaluation_metadata": {}}, "biglam/contentious_contexts": {"name": "biglam/contentious_contexts", "description": "This dataset contains extracts from historical Dutch newspapers which have been containing keywords of potentially contentious words (according to present-day sensibilities). \nThe dataset contains multiple annotations per instance, given the option to quantify agreement scores for annotations. This dataset can be used to track how words and their meanings have changed over time", "evaluation_metadata": {}}, "chintagunta85/bc2gm_test": {"name": "chintagunta85/bc2gm_test", "description": "Nineteen teams presented results for the Gene Mention Task at the BioCreative II Workshop.\nIn this task participants designed systems to identify substrings in sentences corresponding to gene name mentions.\nA variety of different methods were used and the results varied with a highest achieved F1 score of 0.8721.\nHere we present brief descriptions of all the methods used and a statistical analysis of the results.\nWe also demonstrate that, by combining the results from all submissions, an F score of 0.9066 is feasible,\nand furthermore that the best result makes use of the lowest scoring submissions.\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/\nThe original dataset can be downloaded from: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-ii-corpus/\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll", "evaluation_metadata": {}}, "kiddothe2b/contract-nli": {"name": "kiddothe2b/contract-nli", "description": "ContractNLI: A Benchmark Dataset for ContractNLI in English", "evaluation_metadata": {}}, "chintagunta85/bc4chemd": {"name": "chintagunta85/bc4chemd", "description": "The automatic extraction of chemical information from text requires the recognition of chemical entity mentions as one of its key steps. When developing supervised named entity recognition (NER) systems, the availability of a large, manually annotated text corpus is desirable. Furthermore, large corpora permit the robust evaluation and comparison of different approaches that detect chemicals in documents. We present the CHEMDNER corpus, a collection of 10,000 PubMed abstracts that contain a total of 84,355 chemical entity mentions labeled manually by expert chemistry literature curators, following annotation guidelines specifically defined for this task. The abstracts of the CHEMDNER corpus were selected to be representative for all major chemical disciplines. Each of the chemical entity mentions was manually labeled according to its structure-associated chemical entity mention (SACEM) class: abbreviation, family, formula, identifier, multiple, systematic and trivial. The difficulty and consistency of tagging chemicals in text was measured using an agreement study between annotators, obtaining a percentage agreement of 91. For a subset of the CHEMDNER corpus (the test set of 3,000 abstracts) we provide not only the Gold Standard manual annotations, but also mentions automatically detected by the 26 teams that participated in the BioCreative IV CHEMDNER chemical mention recognition task. In addition, we release the CHEMDNER silver standard corpus of automatically extracted mentions from 17,000 randomly selected PubMed abstracts. A version of the CHEMDNER corpus in the BioC format has been generated as well. We propose a standard for required minimum information about entity annotations for the construction of domain specific corpora on chemical and drug entities. The CHEMDNER corpus and annotation guidelines are available at: http://www.biocreative.org/resources/biocreative-iv/chemdner-corpus/", "evaluation_metadata": {}}, "chintagunta85/ncbi_disease": {"name": "chintagunta85/ncbi_disease", "description": "This paper presents the disease name and concept annotations of the NCBI disease corpus, a collection of 793 PubMed\nabstracts fully annotated at the mention and concept level to serve as a research resource for the biomedical natural\nlanguage processing community. Each PubMed abstract was manually annotated by two annotators with disease mentions\nand their corresponding concepts in Medical Subject Headings (MeSH\u00ae) or Online Mendelian Inheritance in Man (OMIM\u00ae).\nManual curation was performed using PubTator, which allowed the use of pre-annotations as a pre-step to manual annotations.\nFourteen annotators were randomly paired and differing annotations were discussed for reaching a consensus in two\nannotation phases. In this setting, a high inter-annotator agreement was observed. Finally, all results were checked\nagainst annotations of the rest of the corpus to assure corpus-wide consistency.\n\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3951655/\n\nThe original dataset can be downloaded from: https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/NCBI_corpus.zip\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll\nNote: there is a duplicate document (PMID 8528200) in the original data, and the duplicate is recreated in the converted data.", "evaluation_metadata": {}}, "oisinoh/tomatos": {"name": "oisinoh/tomatos", "description": "Beans is a dataset of images of beans taken in the field using smartphone\ncameras. It consists of 3 classes: 2 disease classes and the healthy class.\nDiseases depicted include Angular Leaf Spot and Bean Rust. Data was annotated\nby experts from the National Crops Resources Research Institute (NaCRRI) in\nUganda and collected by the Makerere AI research lab.", "evaluation_metadata": {}}, "commanderstrife/jnlpba": {"name": "commanderstrife/jnlpba", "description": "The data came from the GENIA version 3.02 corpus (Kim et al., 2003). This was formed from a controlled search\non MEDLINE using the MeSH terms \u0018human\u0019, \u0018blood cells\u0019 and \u0018transcription factors\u0019. From this search 2,000 abstracts\nwere selected and hand annotated according to a small taxonomy of 48 classes based on a chemical classification.\nAmong the classes, 36 terminal classes were used to annotate the GENIA corpus.", "evaluation_metadata": {}}, "DFKI-SLT/sciarg": {"name": "DFKI-SLT/sciarg", "description": "The SciArg dataset is an extension of the Dr. Inventor corpus (Fisas et al., 2015, 2016) with an annotation layer containing \nfine-grained argumentative components and relations. It is the first argument-annotated corpus of scientific \npublications (in English), which allows for joint analyses of argumentation and other rhetorical dimensions of \nscientific writing.", "evaluation_metadata": {}}, "PaddlePaddle/duconv": {"name": "PaddlePaddle/duconv", "description": "Duconv is a chinese conversation dataset, designed to evaluate the dialogue models.", "evaluation_metadata": {}}, "bigscience/xP3all": {"name": "bigscience/xP3all", "description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.", "evaluation_metadata": {}}, "joelito/lextreme": {"name": "joelito/lextreme", "description": "The LEXTREME Benchmark is a collection of multilingual datasets for evaluating model performance \nacross a diverse set of legal NLU tasks.", "evaluation_metadata": {}}, "vesteinn/sosialurin-faroese-ner": {"name": "vesteinn/sosialurin-faroese-ner", "description": "The corpus that has been created consists of ca. 100.000 words of text from the [Faroese] newspaper Sosialurin. Each word is tagged with named entity information", "evaluation_metadata": {}}, "bigscience/evaluation-results": {"name": "bigscience/evaluation-results", "description": "@misc{muennighoff2022crosslingual,\n title={Crosslingual Generalization through Multitask Finetuning}, \n author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},\n year={2022},\n eprint={2211.01786},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}", "evaluation_metadata": {}}, "tau/sled": {"name": "tau/sled", "description": "Efficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.", "evaluation_metadata": {}}, "jakartaresearch/google-play-review": {"name": "jakartaresearch/google-play-review", "description": "This dataset is built as a playground for beginner to make a use case for creating sentiment analysis model.", "evaluation_metadata": {}}, "andreagasparini/librispeech_train_clean_only": {"name": "andreagasparini/librispeech_train_clean_only", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "andreagasparini/librispeech_train_other_only": {"name": "andreagasparini/librispeech_train_other_only", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "dali-does/clevr-math": {"name": "dali-does/clevr-math", "description": "CLEVR-Math is a dataset for compositional language, visual and mathematical reasoning. CLEVR-Math poses questions about mathematical operations on visual scenes using subtraction and addition, such as \"Remove all large red cylinders. How many objects are left?\". There are also adversarial (e.g. \"Remove all blue cubes. How many cylinders are left?\") and multihop questions (e.g. \"Remove all blue cubes. Remove all small purple spheres. How many objects are left?\").", "evaluation_metadata": {}}, "jakartaresearch/indonews": {"name": "jakartaresearch/indonews", "description": "This dataset is built as a playground for beginner to make a use case for creating sentiment analysis model.", "evaluation_metadata": {}}, "jakartaresearch/poem-tweets": {"name": "jakartaresearch/poem-tweets", "description": "This dataset is built for text generation task in context of poem tweets in Bahasa.", "evaluation_metadata": {}}, "jakartaresearch/cerpen-corpus": {"name": "jakartaresearch/cerpen-corpus", "description": "This dataset is built as a playground for beginner to make a use case for creating sentiment analysis model.", "evaluation_metadata": {}}, "hoskinson-center/proof-pile": {"name": "hoskinson-center/proof-pile", "description": "A dataset of high quality mathematical text.", "evaluation_metadata": {}}, "biglam/yalta_ai_segmonto_manuscript_dataset": {"name": "biglam/yalta_ai_segmonto_manuscript_dataset", "description": "YALTAi: Segmonto Manuscript and Early Printed Book Dataset", "evaluation_metadata": {}}, "jakartaresearch/news-title-gen": {"name": "jakartaresearch/news-title-gen", "description": "This dataset is built for generating text for news title.", "evaluation_metadata": {}}, "galatolo/TeTIm-Eval": {"name": "galatolo/TeTIm-Eval", "description": "Text To Image Evaluation (TeTIm-Eval)", "evaluation_metadata": {}}, "jakartaresearch/indoqa": {"name": "jakartaresearch/indoqa", "description": "This dataset is built for question answering task.", "evaluation_metadata": {}}, "m3/multi_domain_document_classification": {"name": "m3/multi_domain_document_classification", "description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "evaluation_metadata": {}}, "jakartaresearch/id-paraphrase-detection": {"name": "jakartaresearch/id-paraphrase-detection", "description": "This dataset is built as a playground for sequence to sequence classification", "evaluation_metadata": {}}, "jakartaresearch/semeval-absa": {"name": "jakartaresearch/semeval-absa", "description": "This dataset is built as a playground for aspect-based sentiment analysis.", "evaluation_metadata": {}}, "Yaxin/SemEval2016Task5NLTK": {"name": "Yaxin/SemEval2016Task5NLTK", "description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.", "evaluation_metadata": {}}, "jonathanli/echr": {"name": "jonathanli/echr", "description": "The ECHR Cases dataset is designed for experimentation of neural judgment prediction, as in the original 2019 ACL paper \"Neural Legal Judgment Prediction in English\".", "evaluation_metadata": {}}, "cjvt/sentinews": {"name": "cjvt/sentinews", "description": "SentiNews is a Slovenian sentiment classification dataset, consisting of news articles manually annotated with their \nsentiment by between 2 and 6 annotators. The news articles contain political, business, economic and financial content \nfrom the Slovenian news portals 24ur, Dnevnik, Finance, Rtvslo, and \u017durnal24. The texts were annotated using the \nfive-level Lickert scale (1 \u2013 very negative, 2 \u2013 negative, 3 \u2013 neutral, 4 \u2013 positive, and 5 \u2013 very positive) on three \nlevels of granularity, i.e. on the document, paragraph, and sentence level. The final sentiment is determined using \nthe following criterion: negative (if average of scores \u2264 2.4); neutral (if average of scores is between 2.4 and 3.6); \npositive (average of annotated scores \u2265 3.6).", "evaluation_metadata": {}}, "GateNLP/broad_twitter_corpus": {"name": "GateNLP/broad_twitter_corpus", "description": "This is the Broad Twitter corpus, a dataset of tweets collected over stratified times, places and social uses. \nThe goal is to represent a broad range of activities, giving a dataset more representative of the language used \nin this hardest of social media formats to process. Further, the BTC is annotated for named entities.\n\nFor more details see [https://aclanthology.org/C16-1111/](https://aclanthology.org/C16-1111/)", "evaluation_metadata": {}}, "jakartaresearch/indo-movie-subtitle": {"name": "jakartaresearch/indo-movie-subtitle", "description": "This dataset is built as a playground for analyzing text on movie subtitle", "evaluation_metadata": {}}, "MLCommons/peoples_speech": {"name": "MLCommons/peoples_speech", "description": "The People's Speech is a free-to-download 30,000-hour and growing supervised \nconversational English speech recognition dataset licensed for academic and \ncommercial usage under CC-BY-SA (with a CC-BY subset).", "evaluation_metadata": {}}, "thientran/favs_bot": {"name": "thientran/favs_bot", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": [{"col_mapping": {"ner_tags": "tags", "tokens": "tokens"}, "metrics": [{"name": "seqeval", "type": "seqeval"}], "splits": {"eval_split": "test", "train_split": "train"}, "task": "token-classification", "task_id": "entity_extraction"}]}, "cjvt/komet": {"name": "cjvt/komet", "description": "KOMET 1.0 is a hand-annotated corpus for metaphorical expressions which contains about 200,000 words from \nSlovene journalistic, fiction and on-line texts. \n\nTo annotate metaphors in the corpus an adapted and modified procedure of the MIPVU protocol \n(Steen et al., 2010: A method for linguistic metaphor identification: From MIP to MIPVU, https://www.benjamins.com/catalog/celcr.14) \nwas used. The lexical units (words) whose contextual meanings are opposed to their basic meanings are considered \nmetaphor-related words. The basic and contextual meaning for each word in the corpus was identified using the \nDictionary of the standard Slovene Language. The corpus was annotated for the metaphoric following relations: \nindirect metaphor (MRWi), direct metaphor (MRWd), borderline case (WIDLI) and metaphor signal (MFlag). \nIn addition, the corpus introduces a new 'frame' tag, which gives information about the concept to which it refers.", "evaluation_metadata": {}}, "research-backup/semeval2012_relational_similarity_v2": {"name": "research-backup/semeval2012_relational_similarity_v2", "description": "[SemEVAL 2012 task 2: Relational Similarity](https://aclanthology.org/S12-1047/)", "evaluation_metadata": {}}, "yhavinga/cnn_dailymail_dutch": {"name": "yhavinga/cnn_dailymail_dutch", "description": "CNN/DailyMail non-anonymized summarization dataset, translated to Dutch with ccmatrix.\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary", "evaluation_metadata": [{"config": "3.0.0", "task": "summarization", "task_id": "summarization", "splits": {"eval_split": "test"}, "col_mapping": {"article": "text", "highlights": "target"}}]}, "edinburghcstr/ami": {"name": "edinburghcstr/ami", "description": "The AMI Meeting Corpus consists of 100 hours of meeting recordings. The recordings use a range of signals\nsynchronized to a common timeline. These include close-talking and far-field microphones, individual and\nroom-view video cameras, and output from a slide projector and an electronic whiteboard. During the meetings,\nthe participants also have unsynchronized pens available to them that record what is written. The meetings\nwere recorded in English using three different rooms with different acoustic properties, and include mostly\nnon-native speakers. \\n", "evaluation_metadata": {}}, "SLPL/naab": {"name": "SLPL/naab", "description": "Huge corpora of textual data are always known to be a crucial need for training deep models such as transformer-based ones. This issue is emerging more in lower resource languages - like Farsi. We propose naab, the biggest cleaned and ready-to-use open-source textual corpus in Farsi. It contains about 130GB of data, 250 million paragraphs, and 15 billion words. The project name is derived from the Farsi word \u0646\u0627\u0628 which means pure and high-grade.", "evaluation_metadata": {}}, "SLPL/naab-raw": {"name": "SLPL/naab-raw", "description": "Huge corpora of textual data are always known to be a crucial need for training deep models such as transformer-based ones. This issue is emerging more in lower resource languages - like Farsi. We propose naab, the biggest cleaned and ready-to-use open-source textual corpus in Farsi. It contains about 130GB of data, 250 million paragraphs, and 15 billion words. The project name is derived from the Farsi word \u0646\u0627\u0628 which means pure and high-grade. This corpus contains the raw (uncleaned) version of it.", "evaluation_metadata": {}}, "projecte-aina/WikiCAT_ca": {"name": "projecte-aina/WikiCAT_ca", "description": "WikiCAT: Text Classification Catalan dataset from the Viquipedia", "evaluation_metadata": {}}, "jakartaresearch/inglish": {"name": "jakartaresearch/inglish", "description": "This dataset is built as a playground for beginner to make a translation model for Indonesian and English.", "evaluation_metadata": {}}, "csebuetnlp/BanglaNMT": {"name": "csebuetnlp/BanglaNMT", "description": "This is the largest Machine Translation (MT) dataset for Bengali-English, introduced in the paper\n`Not Low-Resource Anymore: Aligner Ensembling, Batch Filtering, and New Datasets for Bengali-English Machine Translation`.", "evaluation_metadata": {}}, "yhavinga/xsum_dutch": {"name": "yhavinga/xsum_dutch", "description": "Extreme Summarization (XSum) Dataset.\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.", "evaluation_metadata": [{"config": "default", "task": "summarization", "task_id": "summarization", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"document": "text", "summary": "target"}, "metrics": [{"type": "rouge", "name": "Rouge"}]}]}, "masakhane/mafand": {"name": "masakhane/mafand", "description": "MAFAND-MT is the largest MT benchmark for African languages in the news domain, covering 21 languages. The languages covered are:\n- Amharic\n- Bambara\n- Ghomala\n- Ewe\n- Fon\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Mossi\n- Nigerian-Pidgin\n- Chichewa\n- Shona\n- Swahili\n- Setswana\n- Twi\n- Wolof\n- Xhosa\n- Yoruba\n- Zulu\n\nThe train/validation/test sets are available for 16 languages, and validation/test set for amh, kin, nya, sna, and xho\n\nFor more details see https://aclanthology.org/2022.naacl-main.223/", "evaluation_metadata": {}}, "UKPLab/TexPrax": {"name": "UKPLab/TexPrax", "description": "This dataset was collected in the [TexPrax](https://texprax.de/) project and contains named entities annotated by three researchers as well as annotated sentences (problem/P, cause/C, solution/S, and other/O).", "evaluation_metadata": {}}, "RCC-MSU/collection3": {"name": "RCC-MSU/collection3", "description": "Collection3 is a Russian dataset for named entity recognition annotated with LOC (location), PER (person), and ORG (organization) tags.\n\nDataset is based on collection Persons-1000 originally containing 1000 news documents labeled only with names of persons.\nAdditional labels were added by Valerie Mozharova and Natalia Loukachevitch.\nConversion to the IOB2 format and splitting into train, validation and test sets was done by DeepPavlov team.\n\nFor more details see https://ieeexplore.ieee.org/document/7584769 and http://labinform.ru/pub/named_entities/index.htm", "evaluation_metadata": {}}, "jonathanli/eurlex": {"name": "jonathanli/eurlex", "description": "EURLEX57K contains 57k legislative documents in English from EUR-Lex portal, annotated with EUROVOC concepts.", "evaluation_metadata": {}}, "OxAISH-AL-LLM/wiki_toxic": {"name": "OxAISH-AL-LLM/wiki_toxic", "description": "Jigsaw Toxic Comment Challenge dataset. This dataset was the basis of a Kaggle competition run by Jigsaw", "evaluation_metadata": {}}, "angelolab/ark_example": {"name": "angelolab/ark_example", "description": "This dataset contains 11 Field of Views (FOVs), each with 22 channels.", "evaluation_metadata": {}}, "nbtpj/bionlp2021MAS": {"name": "nbtpj/bionlp2021MAS", "description": "MEDIQA @ NAACL-BioNLP 2021 -- Task 2: Multi-answer summarization\n\nhttps://sites.google.com/view/mediqa2021\n\nTraining Data\n\nThe MEDIQA-AnS Dataset could be used for training.\n\nParticipants can use available external resources such as existing medical QA datasets.\n\nValidation and Test Sets\n\nThe original answers are generated by the consumer health question answering system CHiQA which searches for answers from only trustworthy medical information sources. The summaries are manually created by medical experts.\n\nThe validation set contains 192 answers associated with 50 questions. Each question has at least two answers and their summaries. For each question, we provide two types of summaries: extractive and abstractive. We encourage the use of all types of summarization approaches (extractive, abstractive, and hybrid). We will also provide the questions in the official test set as they can be used as additional inputs for the summarization models.\n\nThe test set contains 303 answers associated with 80 questions. For each test question, we provide two reference summaries: extractive and abstractive. ** In the official competiton, we used the abstractive reference summaries to evaluate the abstractive systems and the extractive summaries to evaluate the extractive systems (only extractive summaries were used on AIcrowd-Task2).", "evaluation_metadata": {}}, "BDas/ArabicNLPDataset": {"name": "BDas/ArabicNLPDataset", "description": "The dataset, prepared in Arabic, includes 10.000 tests, 10.000 validations and 80000 train data.\nThe data is composed of customer comments and created from e-commerce sites.", "evaluation_metadata": {}}, "BDas/EnglishNLPDataset": {"name": "BDas/EnglishNLPDataset", "description": "The dataset, prepared in English, includes 10.000 tests, 10.000 validations and 80000 train data.\nThe data is composed of customer comments and created from e-commerce sites.", "evaluation_metadata": {}}, "unpredictable/unpredictable_full": {"name": "unpredictable/unpredictable_full", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "unpredictable/unpredictable_5k": {"name": "unpredictable/unpredictable_5k", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "unpredictable/unpredictable_support-google-com": {"name": "unpredictable/unpredictable_support-google-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "unpredictable/unpredictable_unique": {"name": "unpredictable/unpredictable_unique", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "evaluation_metadata": {}}, "alexandrainst/scandi-qa": {"name": "alexandrainst/scandi-qa", "description": "ScandiQA is a dataset of questions and answers in the Danish, Norwegian, and Swedish\nlanguages. All samples come from the Natural Questions (NQ) dataset, which is a large\nquestion answering dataset from Google searches. The Scandinavian questions and answers\ncome from the MKQA dataset, where 10,000 NQ samples were manually translated into,\namong others, Danish, Norwegian, and Swedish. However, this did not include a\ntranslated context, hindering the training of extractive question answering models.\n\nWe merged the NQ dataset with the MKQA dataset, and extracted contexts as either \"long\nanswers\" from the NQ dataset, being the paragraph in which the answer was found, or\notherwise we extract the context by locating the paragraphs which have the largest\ncosine similarity to the question, and which contains the desired answer.\n\nFurther, many answers in the MKQA dataset were \"language normalised\": for instance, all\ndate answers were converted to the format \"YYYY-MM-DD\", meaning that in most cases\nthese answers are not appearing in any paragraphs. We solve this by extending the MKQA\nanswers with plausible \"answer candidates\", being slight perturbations or translations\nof the answer.\n\nWith the contexts extracted, we translated these to Danish, Swedish and Norwegian using\nthe DeepL translation service for Danish and Swedish, and the Google Translation\nservice for Norwegian. After translation we ensured that the Scandinavian answers do\nindeed occur in the translated contexts.\n\nAs we are filtering the MKQA samples at both the \"merging stage\" and the \"translation\nstage\", we are not able to fully convert the 10,000 samples to the Scandinavian\nlanguages, and instead get roughly 8,000 samples per language. These have further been\nsplit into a training, validation and test split, with the former two containing\nroughly 750 samples. The splits have been created in such a way that the proportion of\nsamples without an answer is roughly the same in each split.", "evaluation_metadata": {}}, "evaluate/glue-ci": {"name": "evaluate/glue-ci", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": [{"config": "cola", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence": "text", "label": "target"}}, {"config": "sst2", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence": "text", "label": "target"}}, {"config": "mrpc", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "qqp", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question1": "text1", "question2": "text2", "label": "target"}}, {"config": "stsb", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "mnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation_matched"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "mnli_mismatched", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "mnli_matched", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "qnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question": "text1", "sentence": "text2", "label": "target"}}, {"config": "rte", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "wnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}]}, "opus/liv4ever": {"name": "opus/liv4ever", "description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.", "evaluation_metadata": {}}, "jamescalam/image-text-demo": {"name": "jamescalam/image-text-demo", "description": "Demo dataset for testing or showing image-text capabilities.", "evaluation_metadata": {}}, "bigbio/biosses": {"name": "bigbio/biosses", "description": "BIOSSES computes similarity of biomedical sentences by utilizing WordNet as the\ngeneral domain ontology and UMLS as the biomedical domain specific ontology.\nThe original paper outlines the approaches with respect to using annotator\nscore as golden standard. Source view will return all annotator score\nindividually whereas the Bigbio view will return the mean of the annotator\nscore.", "evaluation_metadata": {}}, "UKPLab/UKP_ASPECT": {"name": "UKPLab/UKP_ASPECT", "description": "The UKP ASPECT Corpus includes 3,595 sentence pairs over 28 controversial topics. The sentences were crawled from a large web crawl and identified as arguments for a given topic using the ArgumenText system. The sampling and matching of the sentence pairs is described in the paper. Then, the argument similarity annotation was done via crowdsourcing. Each crowd worker could choose from four annotation options (the exact guidelines are provided in the Appendix of the paper).", "evaluation_metadata": {}}, "eraldoluis/faquad": {"name": "eraldoluis/faquad", "description": "Academic secretaries and faculty members of higher education institutions face a common problem: \n the abundance of questions sent by academics \n whose answers are found in available institutional documents. \nThe official documents produced by Brazilian public universities are vast and disperse, \n which discourage students to further search for answers in such sources.\nIn order to lessen this problem, we present FaQuAD: \n a novel machine reading comprehension dataset \n in the domain of Brazilian higher education institutions. \nFaQuAD follows the format of SQuAD (Stanford Question Answering Dataset) [Rajpurkar et al. 2016]. \nIt comprises 900 questions about 249 reading passages (paragraphs), \n which were taken from 18 official documents of a computer science college \n from a Brazilian federal university \n and 21 Wikipedia articles related to Brazilian higher education system. \nAs far as we know, this is the first Portuguese reading comprehension dataset in this format.", "evaluation_metadata": [{"config": "plain_text", "task": "question-answering", "task_id": "extractive_question_answering", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question": "question", "context": "context", "answers": {"text": "text", "answer_start": "answer_start"}}, "metrics": [{"type": "squad", "name": "SQuAD"}]}]}, "cjvt/solar3": {"name": "cjvt/solar3", "description": "\u0160olar is a developmental corpus of 5485 school texts (e.g., essays), written by students in Slovenian secondary schools \n(age 15-19) and pupils in the 7th-9th grade of primary school (13-15), with a small percentage also from the 6th grade. \nPart of the corpus (1516 texts) is annotated with teachers' corrections using a system of labels described in the \ndocument available at https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1589/Smernice-za-oznacevanje-korpusa-Solar_V1.1.pdf (in Slovenian).", "evaluation_metadata": {}}, "SocialGrep/the-reddit-climate-change-dataset": {"name": "SocialGrep/the-reddit-climate-change-dataset", "description": "All the mentions of climate change on Reddit before Sep 1 2022.", "evaluation_metadata": {}}, "ju-resplande/rebel-pt": {"name": "ju-resplande/rebel-pt", "description": "REBEL-Portuguese is an REBEL adaptation for Portuguese.", "evaluation_metadata": {}}, "cannlytics/cannabis_tests": {"name": "cannlytics/cannabis_tests", "description": "Cannabis lab test results (https://cannlytics.com/data/results) is a\ndataset of curated cannabis lab test results.", "evaluation_metadata": {}}, "neulab/conala": {"name": "neulab/conala", "description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.", "evaluation_metadata": {}}, "codesue/kelly": {"name": "codesue/kelly", "description": "The Swedish Kelly list is a freely available frequency-based vocabulary list that comprises general-purpose language of modern Swedish. The list was generated from a large web-acquired corpus (SweWaC) of 114 million words dating from the 2010s. It is adapted to the needs of language learners and contains 8,425 most frequent lemmas that cover 80% of SweWaC.\\", "evaluation_metadata": {}}, "PlanTL-GOB-ES/wnli-es": {"name": "PlanTL-GOB-ES/wnli-es", "description": "professional translation into Spanish of Winograd NLI dataset as published in GLUE Benchmark.\n The Winograd NLI dataset presents 855 sentence pairs, \n in which the first sentence contains an ambiguity and the second one a possible interpretation of it. \n The label indicates if the interpretation is correct (1) or not (0).", "evaluation_metadata": {}}, "anton-l/earnings22_baseline_5_gram": {"name": "anton-l/earnings22_baseline_5_gram", "description": "\\nThe Earnings 22 dataset ( also referred to as earnings22 ) is a 119-hour corpus of English-language earnings calls collected from global companies. \nThe primary purpose is to serve as a benchmark for industrial and academic automatic speech recognition (ASR) models on real-world accented speech.", "evaluation_metadata": {}}, "skytnt/fbanimehq": {"name": "skytnt/fbanimehq", "description": "FBAnimeHQ is a dataset with high-quality full-body anime girl images in a resolution of 1024 \u00d7 512.", "evaluation_metadata": {}}, "cjvt/gkomet": {"name": "cjvt/gkomet", "description": "G-KOMET 1.0 (a corpus of metaphorical expressions in spoken Slovene language) is a corpus of speech transcriptions and \nconversations that covers 50,000 lexical units. The corpus contains samples from the Gos corpus of spoken Slovene \nand includes a balanced set of transcriptions of informative, educational, entertaining, private, and public discourse.\n\nThe annotation scheme was based on the MIPVU metaphor identification process. \nThis protocol was modified and adapted to the specifics of the Slovene language and the specifics of the spoken \nlanguage. Corpus was annotated for the following relations to metaphor: indirect metaphor, direct metaphor, borderline \ncases and metaphor signals. In addition, the corpus introduces a new \u2018frame\u2019 tag, which gives information about the \nconcept to which it refers.", "evaluation_metadata": {}}, "asapp/slue": {"name": "asapp/slue", "description": "Spoken Language Understanding Evaluation (SLUE) benchmark. There are two subsets: (i) SLUE-VoxPopuli which has ASR and NER tasks and (ii) SLUE-VoxCeleb which has ASR and SA tasks.", "evaluation_metadata": {}}, "cjvt/rsdo4_en_sl": {"name": "cjvt/rsdo4_en_sl", "description": "The RSDO4 parallel corpus of English-Slovene and Slovene-English translation pairs was collected as part of work \npackage 4 of the Slovene in the Digital Environment project. It contains texts collected from public institutions \nand texts submitted by individual donors through the text collection portal created within the project. The corpus \nconsists of 964433 translation pairs (extracted from standard translation formats (TMX, XLIFF) or manually aligned) \nin randomized order which can be used for machine translation training.", "evaluation_metadata": {}}, "THUDM/humaneval-x": {"name": "THUDM/humaneval-x", "description": "HumanEval-X is a benchmark for the evaluation of the multilingual ability of code generative models. It consists of 820 high-quality human-crafted data samples (each with test cases) in Python, C++, Java, JavaScript, and Go, and can be used for various tasks.", "evaluation_metadata": {}}, "gexai/inquisitiveqg": {"name": "gexai/inquisitiveqg", "description": "A dataset of about 20k questions that are elicited from readers as they naturally read through a document sentence by sentence. Compared to existing datasets, INQUISITIVE questions target more towards high-level (semantic and discourse) comprehension of text. Because these questions are generated while the readers are processing the information, the questions directly communicate gaps between the reader\u2019s and writer\u2019s knowledge about the events described in the text, and are not necessarily answered in the document itself. This type of question reflects a real-world scenario: if one has questions during reading, some of them are answered by the text later on, the rest are not, but any of them would help further the reader\u2019s understanding at the particular point when they asked it. This resource could enable question generation models to simulate human-like curiosity and cognitive processing, which may open up a new realm of applications.", "evaluation_metadata": {}}, "zpn/pubchem_selfies": {"name": "zpn/pubchem_selfies", "description": "This dataset contains ~100M molecules from PubChem, with their SMILES and SELFIES representations.", "evaluation_metadata": {}}, "cjvt/ssj500k": {"name": "cjvt/ssj500k", "description": "The ssj500k training corpus contains about 500 000 tokens manually annotated on the levels of tokenisation,\nsentence segmentation, morphosyntactic tagging, and lemmatisation. About half of the corpus is also manually annotated \nwith syntactic dependencies, named entities, and verbal multiword expressions. About a quarter of the corpus is also \nannotated with semantic role labels. The morphosyntactic tags and syntactic dependencies are included both in the \nJOS/MULTEXT-East framework, as well as in the framework of Universal Dependencies.", "evaluation_metadata": {}}, "EMBO/sd-character-level-ner": {"name": "EMBO/sd-character-level-ner", "description": " This dataset is based on the SourceData database and is intented to facilitate training of NLP tasks in the cell and molecualr biology domain.", "evaluation_metadata": {}}, "HuggingFaceM4/cm4-synthetic-testing": {"name": "HuggingFaceM4/cm4-synthetic-testing", "description": "This dataset is designed to be used in testing. It's derived from cm4-10k dataset", "evaluation_metadata": {}}, "bigbio/gad": {"name": "bigbio/gad", "description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database", "evaluation_metadata": {}}, "joelito/Multi_Legal_Pile": {"name": "joelito/Multi_Legal_Pile", "description": "Multi Legal Pile is a dataset of legal documents in the 24 EU languages.", "evaluation_metadata": {}}, "osbm/zenodo": {"name": "osbm/zenodo", "description": "This dataset is for downloading a Zenodo dataset without extra packages.", "evaluation_metadata": {}}, "EMBO/sd-nlp-v2": {"name": "EMBO/sd-nlp-v2", "description": " This dataset is based on the SourceData database and is intended to facilitate training of NLP tasks in the cell and molecualr biology domain.", "evaluation_metadata": {}}, "cjvt/cosimlex": {"name": "cjvt/cosimlex", "description": "The dataset contains human similarity ratings for pairs of words. The annotators were presented with contexts that \ncontained both of the words in the pair and the dataset features two different contexts per pair. The words were \nsourced from the English, Croatian, Finnish and Slovenian versions of the original Simlex dataset.", "evaluation_metadata": {}}, "severo/winogavil": {"name": "severo/winogavil", "description": "WinoGAViL is a challenging dataset for evaluating vision-and-language commonsense reasoning abilities. Given a set of images, a cue, and a number K, the task is to select the K images that best fits the association. This dataset was collected via the WinoGAViL online game to collect vision-and-language associations, (e.g., werewolves to a full moon). Inspired by the popular card game Codenames, a spymaster gives a textual cue related to several visual candidates, and another player has to identify them. Human players are rewarded for creating associations that are challenging for a rival AI model but still solvable by other human players. We evaluate several state-of-the-art vision-and-language models, finding that they are intuitive for humans (>90% Jaccard index) but challenging for state-of-the-art AI models, where the best model (ViLT) achieves a score of 52%, succeeding mostly where the cue is visually salient. Our analysis as well as the feedback we collect from players indicate that the collected associations require diverse reasoning skills, including general knowledge, common sense, abstraction, and more.", "evaluation_metadata": {}}, "jmercat/risk_biased_dataset": {"name": "jmercat/risk_biased_dataset", "description": " Dataset of pre-processed samples from a small portion of the Waymo Open Motion Data for our risk-biased prediction task.", "evaluation_metadata": {}}, "projecte-aina/UD_Catalan-AnCora": {"name": "projecte-aina/UD_Catalan-AnCora", "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).", "evaluation_metadata": {}}, "bigscience/xP3mt": {"name": "bigscience/xP3mt", "description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.", "evaluation_metadata": {}}, "nuprl/MultiPL-E": {"name": "nuprl/MultiPL-E", "description": "MultiPL-E is a dataset for evaluating large language models for code generation that supports 18 programming languages. It takes the OpenAI \"HumanEval\" and the MBPP Python benchmarks and uses little compilers to translate them to other languages. It is easy to add support for new languages and benchmarks.", "evaluation_metadata": {}}, "cannlytics/cannabis_licenses": {"name": "cannlytics/cannabis_licenses", "description": "Cannabis Licenses (https://cannlytics.com/data/licenses) is a\ndataset of curated cannabis license data. The dataset consists of 18\nsub-datasets for each state with permitted adult-use cannabis, as well\nas a sub-dataset that includes all licenses.", "evaluation_metadata": {}}, "jmhessel/newyorker_caption_contest": {"name": "jmhessel/newyorker_caption_contest", "description": "There are 3 caption contest tasks, described in the paper. In the Matching multiple choice task, models must recognize a caption written about a cartoon (vs. options that were not). In the Quality Ranking task, models must evaluate the quality\nof that caption by scoring it more highly than a lower quality option from the same contest. In the Explanation Generation task, models must explain why the joke is funny.", "evaluation_metadata": {}}, "skytnt/anime-segmentation": {"name": "skytnt/anime-segmentation", "description": "A segmentation dataset for anime character", "evaluation_metadata": {}}, "alkzar90/NIH-Chest-X-ray-dataset": {"name": "alkzar90/NIH-Chest-X-ray-dataset", "description": "The NIH Chest X-ray dataset consists of 100,000 de-identified images of chest x-rays. The images are in PNG format.\n\nThe data is provided by the NIH Clinical Center and is available through the NIH download site: https://nihcc.app.box.com/v/ChestXray-NIHCC", "evaluation_metadata": {}}, "TurkuNLP/xlsum-fi": {"name": "TurkuNLP/xlsum-fi", "description": "This dataset is a DeepL -based machine translation of a part of the English section of the XLSum dataset:[https://github.com/csebuetnlp/xl-sum](https://github.com/csebuetnlp/xl-sum) In the present version, only examples where the full version is at most 10x the summary in length are included. We might translate more later.", "evaluation_metadata": {}}, "Divyanshu/IE_SemParse": {"name": "Divyanshu/IE_SemParse", "description": " IE-SemParse is an Inter-bilingual Seq2seq Semantic parsing dataset for 11 distinct Indian languages", "evaluation_metadata": {}}, "giulio98/xlcost-formatted": {"name": "giulio98/xlcost-formatted", "description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "evaluation_metadata": {}}, "Sanatbek/uzbek-kazakh-parallel-corpora": {"name": "Sanatbek/uzbek-kazakh-parallel-corpora", "description": "This is a collection of translated sentences from Uzbek to Kazakh\n2 languages, #3,403 bitexts\ntotal number of files: #750\ntotal number of tokens: #65.54M\ntotal number of sentence fragments: #8.96M", "evaluation_metadata": {}}, "bigbio/blurb": {"name": "bigbio/blurb", "description": "The BioCreative II Gene Mention task. The training corpus for the current task consists mainly of the training and testing corpora (text collections) from the BCI task, and the testing corpus for the current task consists of an additional 5,000 sentences that were held 'in reserve' from the previous task. In the current corpus, tokenization is not provided; instead participants are asked to identify a gene mention in a sentence by giving its start and end characters. As before, the training set consists of a set of sentences, and for each sentence a set of gene mentions (GENE annotations).\n\n- Homepage: https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/task-1a-gene-mention-tagging/\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: Overview of BioCreative II gene mention recognition\n https://link.springer.com/article/10.1186/gb-2008-9-s2-s2", "evaluation_metadata": {}}, "khaclinh/pp4av": {"name": "khaclinh/pp4av", "description": "PP4AV is the first public dataset with faces and license plates annotated with driving scenarios. \nP4AV provides 3,447 annotated driving images for both faces and license plates. \nFor normal camera data, dataset sampled images from the existing videos in which cameras were mounted in moving vehicles, running around the European cities.\nThe images in PP4AV were sampled from 6 European cities at various times of day, including nighttime. \nThis dataset use the fisheye images from the WoodScape dataset to select 244 images from the front, rear, left, and right cameras for fisheye camera data. \nPP4AV dataset can be used as a benchmark suite (evaluating dataset) for data anonymization models in autonomous driving.", "evaluation_metadata": {}}, "shjwudp/shu": {"name": "shjwudp/shu", "description": "shu is a chinese book dataset.", "evaluation_metadata": {}}, "Tidrael/tsl_news": {"name": "Tidrael/tsl_news", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "loubnabnl/humaneval_infilling": {"name": "loubnabnl/humaneval_infilling", "description": "An evaluation benchamrk for infilling tasks on HumanEval dataset for code generation.", "evaluation_metadata": {}}, "HuggingFaceM4/general-pmd-synthetic-testing": {"name": "HuggingFaceM4/general-pmd-synthetic-testing", "description": "This dataset is designed to be used in testing. It's derived from general-pmd-10k dataset", "evaluation_metadata": {}}, "dennlinger/eur-lex-sum": {"name": "dennlinger/eur-lex-sum", "description": "The EUR-Lex-Sum dataset is a multilingual resource intended for text summarization in the legal domain.\nIt is based on human-written summaries of legal acts issued by the European Union.\nIt distinguishes itself by introducing a smaller set of high-quality human-written samples,\neach of which have much longer references (and summaries!) than comparable datasets.\nAdditionally, the underlying legal acts provide a challenging domain-specific application to legal texts,\nwhich are so far underrepresented in non-English languages.\nFor each legal act, the sample can be available in up to 24 languages\n(the officially recognized languages in the European Union);\nthe validation and test samples consist entirely of samples available in all languages,\nand are aligned across all languages at the paragraph level.", "evaluation_metadata": {}}, "bigscience/xP3": {"name": "bigscience/xP3", "description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.", "evaluation_metadata": {}}, "allenai/scirepeval_test": {"name": "allenai/scirepeval_test", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "allenai/scirepeval": {"name": "allenai/scirepeval", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "RussianNLP/tape": {"name": "RussianNLP/tape", "description": "The Winograd schema challenge composes tasks with syntactic ambiguity,\nwhich can be resolved with logic and reasoning (Levesque et al., 2012).\n\nThe texts for the Winograd schema problem are obtained using a semi-automatic \npipeline. First, lists of 11 typical grammatical structures with syntactic \nhomonymy (mainly case) are compiled. For example, two noun phrases with a \ncomplex subordinate: 'A trinket from Pompeii that has survived the centuries'.\nRequests corresponding to these constructions are submitted in search of the \nRussian National Corpus, or rather its sub-corpus with removed homonymy. In the \nresulting 2+k examples, homonymy is removed automatically with manual validation\nafterward. Each original sentence is split into multiple examples in the binary \nclassification format, indicating whether the homonymy is resolved correctly or\nnot.", "evaluation_metadata": {}}, "sled-umich/TRIP": {"name": "sled-umich/TRIP", "description": "We introduce Tiered Reasoning for Intuitive Physics (TRIP), a novel commonsense reasoning dataset with dense annotations that enable multi-tiered evaluation of machines\u2019 reasoning process.", "evaluation_metadata": {}}, "sled-umich/Action-Effect": {"name": "sled-umich/Action-Effect", "description": "Despite recent advances in knowledge representation, automated reasoning, and machine learning, artificial agents still lack the ability to understand basic action-effect relations regarding the physical world, for example, the action of cutting a cucumber most likely leads to the state where the cucumber is broken apart into smaller pieces. If artificial agents (e.g., robots) ever become our partners in joint tasks, it is critical to empower them with such action-effect understanding so that they can reason about the state of the world and plan for actions. Towards this goal, this paper introduces a new task on naive physical action-effect prediction, which addresses the relations between concrete actions (expressed in the form of verb-noun pairs) and their effects on the state of the physical world as depicted by images. We collected a dataset for this task and developed an approach that harnesses web image data through distant supervision to facilitate learning for action-effect prediction. Our empirical results have shown that web data can be used to complement a small number of seed examples (e.g., three examples for each action) for model learning. This opens up possibilities for agents to learn physical action-effect relations for tasks at hand through communication with humans with a few examples.", "evaluation_metadata": {}}, "dummy-canonical-org/dummy_canonical_dataset": {"name": "dummy-canonical-org/dummy_canonical_dataset", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "csebuetnlp/BanglaParaphrase": {"name": "csebuetnlp/BanglaParaphrase", "description": "We present a high quality bangla paraphrase dataset containing about 466k paraphrase pairs. The paraphrases ensures high quality by being semantically coherent and syntactically diverse.", "evaluation_metadata": {}}, "inmortalkaktus/pokemon-pixel-art": {"name": "inmortalkaktus/pokemon-pixel-art", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "cjvt/sloie": {"name": "cjvt/sloie", "description": "SloIE is a manually labelled dataset of Slovene idiomatic expressions. \nIt contains 29,400 sentences with 75 different expressions that can occur with either a literal or an idiomatic meaning, \nwith appropriate manual annotations for each token. The idiomatic expressions were selected from the Slovene Lexical \nDatabase (http://hdl.handle.net/11356/1030). Only expressions that can occur with both a literal and an idiomatic \nmeaning were selected. The sentences were extracted from the Gigafida corpus.", "evaluation_metadata": {}}, "elenanereiss/german-ler": {"name": "elenanereiss/german-ler", "description": "A dataset of Legal Documents from German federal court decisions for Named Entity Recognition. The dataset is human-annotated with 19 fine-grained entity classes. The dataset consists of approx. 67,000 sentences and contains 54,000 annotated entities.", "evaluation_metadata": [{"config": "conll2003", "task": "token-classification", "task_id": "entity_extraction", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"tokens": "tokens", "ner_tags": "tags"}}]}, "arize-ai/beer_reviews_label_drift_neg": {"name": "arize-ai/beer_reviews_label_drift_neg", "description": "This dataset was crafted to be used in our tutorial [Link to the tutorial when\nready]. It consists on product reviews from an e-commerce store. The reviews\nare labeled on a scale from 1 to 5 (stars). The training & validation sets are\nfully composed by reviews written in english. However, the production set has\nsome reviews written in spanish. At Arize, we work to surface this issue and\nhelp you solve it.", "evaluation_metadata": {}}, "arize-ai/beer_reviews_label_drift_neutral": {"name": "arize-ai/beer_reviews_label_drift_neutral", "description": "This dataset was crafted to be used in our tutorial [Link to the tutorial when\nready]. It consists on product reviews from an e-commerce store. The reviews\nare labeled on a scale from 1 to 5 (stars). The training & validation sets are\nfully composed by reviews written in english. However, the production set has\nsome reviews written in spanish. At Arize, we work to surface this issue and\nhelp you solve it.", "evaluation_metadata": {}}, "tomasg25/scientific_lay_summarisation": {"name": "tomasg25/scientific_lay_summarisation", "description": "This repository contains the PLOS and eLife datasets, introduced in the EMNLP 2022 paper \"[Making Science Simple: Corpora for the Lay Summarisation of Scientific Literature\n](https://arxiv.org/abs/2210.09932)\". \nEach dataset contains full biomedical research articles paired with expert-written lay summaries (i.e., non-technical summaries). PLOS articles are derived from various journals published by [the Public Library of Science (PLOS)](https://plos.org/), whereas eLife articles are derived from the [eLife](https://elifesciences.org/) journal. More details/anlaysis on the content of each dataset are provided in the paper.\n\nBoth \"elife\" and \"plos\" have 6 features:\n - \"article\": the body of the document (including the abstract), sections seperated by \"/n\".\n - \"section_headings\": the title of each section, seperated by \"/n\". \n - \"keywords\": keywords describing the topic of the article, seperated by \"/n\".\n - \"title\" : the title of the article.\n - \"year\" : the year the article was published.\n - \"summary\": the lay summary of the document.", "evaluation_metadata": {}}, "research-backup/semeval2012_relational_similarity_v3": {"name": "research-backup/semeval2012_relational_similarity_v3", "description": "[SemEVAL 2012 task 2: Relational Similarity](https://aclanthology.org/S12-1047/)", "evaluation_metadata": {}}, "cjvt/slo_thesaurus": {"name": "cjvt/slo_thesaurus", "description": "This is an automatically created Slovene thesaurus from Slovene data available in a comprehensive \nEnglish\u2013Slovenian dictionary, a monolingual dictionary, and a corpus. A network analysis on the bilingual dictionary \nword co-occurrence graph was used, together with additional information from the distributional thesaurus data \navailable as part of the Sketch Engine tool and extracted from the 1.2 billion word Gigafida corpus and the \nmonolingual dictionary.", "evaluation_metadata": {}}, "cjvt/slownet": {"name": "cjvt/slownet", "description": "sloWNet is the Slovene WordNet developed in the expand approach: it contains the complete Princeton WordNet 3.0 and \nover 70 000 Slovene literals. These literals have been added automatically using different types of existing resources, \nsuch as bilingual dictionaries, parallel corpora and Wikipedia. 33 000 literals have been subsequently hand-validated.", "evaluation_metadata": {}}, "research-backup/semeval2012_relational_similarity_v4": {"name": "research-backup/semeval2012_relational_similarity_v4", "description": "[SemEVAL 2012 task 2: Relational Similarity](https://aclanthology.org/S12-1047/)", "evaluation_metadata": {}}, "sanchit-gandhi/librispeech_asr_clean": {"name": "sanchit-gandhi/librispeech_asr_clean", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "drt/kqa_pro": {"name": "drt/kqa_pro", "description": "A large-scale, diverse, challenging dataset of complex question answering over knowledge base.", "evaluation_metadata": {}}, "research-backup/semeval2012_relational_similarity_v5": {"name": "research-backup/semeval2012_relational_similarity_v5", "description": "[SemEVAL 2012 task 2: Relational Similarity](https://aclanthology.org/S12-1047/)", "evaluation_metadata": {}}, "TomTBT/pmc_open_access_figure": {"name": "TomTBT/pmc_open_access_figure", "description": "The PMC Open Access Subset includes more than 3.4 million journal articles and preprints that are made available under\nlicense terms that allow reuse.\nNot all articles in PMC are available for text mining and other reuse, many have copyright protection, however articles\nin the PMC Open Access Subset are made available under Creative Commons or similar licenses that generally allow more\nliberal redistribution and reuse than a traditional copyrighted work.\nThe PMC Open Access Subset is one part of the PMC Article Datasets\n\nThis version focus on associating the graphics of figures with their captions", "evaluation_metadata": {}}, "drt/complex_web_questions": {"name": "drt/complex_web_questions", "description": " ComplexWebQuestions is a dataset for answering complex questions that require reasoning over multiple web snippets. It contains a large set of complex questions in natural language, and can be used in multiple ways: 1) By interacting with a search engine, which is the focus of our paper (Talmor and Berant, 2018); 2) As a reading comprehension task: we release 12,725,989 web snippets that are relevant for the questions, and were collected during the development of our model; 3) As a semantic parsing task: each question is paired with a SPARQL query that can be executed against Freebase to retrieve the answer.", "evaluation_metadata": {}}, "matejklemen/vuamc": {"name": "matejklemen/vuamc", "description": "The resource contains a selection of excerpts from BNC-Baby files that have been annotated for metaphor. \nThere are four registers, each comprising about 50,000 words: academic texts, news texts, fiction, and conversations. \nWords have been separately labelled as participating in multi-word expressions (about 1.5%) or as discarded for \nmetaphor analysis (0.02%). Main categories include words that are related to metaphor (MRW), words that signal \nmetaphor (MFlag), and words that are not related to metaphor. For metaphor-related words, subdivisions have been made \nbetween clear cases of metaphor versus borderline cases (WIDLII, When In Doubt, Leave It In). Another parameter of \nmetaphor-related words makes a distinction between direct metaphor, indirect metaphor, and implicit metaphor.", "evaluation_metadata": {}}, "projecte-aina/Parafraseja": {"name": "projecte-aina/Parafraseja", "description": "Parafraseja is a dataset of 16,584 pairs of sentences with a label that indicates if they are paraphrases or not. The original sentences were collected from TE-ca and STS-ca. For each sentence, an annotator wrote a sentence that was a paraphrase and another that was not. The guidelines of this annotation are available.", "evaluation_metadata": {}}, "poloclub/diffusiondb": {"name": "poloclub/diffusiondb", "description": "DiffusionDB is the first large-scale text-to-image prompt dataset. It contains 2\nmillion images generated by Stable Diffusion using prompts and hyperparameters\nspecified by real users. The unprecedented scale and diversity of this\nhuman-actuated dataset provide exciting research opportunities in understanding\nthe interplay between prompts and generative models, detecting deepfakes, and\ndesigning human-AI interaction tools to help users more easily use these models.", "evaluation_metadata": {}}, "feradauto/MoralExceptQA": {"name": "feradauto/MoralExceptQA", "description": "We present a novel challenge set consisting of moral exception question answering (MoralExceptQA) of cases that involve potentially permissible moral exceptions.", "evaluation_metadata": {}}, "Dialogue-Model-Research-Group/v2ex": {"name": "Dialogue-Model-Research-Group/v2ex", "description": "V2EX is a dataset curated by https://www.v2ex.com/ open data.", "evaluation_metadata": {}}, "taln-ls2n/kpbiomed": {"name": "taln-ls2n/kpbiomed", "description": "KPBiomed benchmark dataset for keyphrase extraction an generation.", "evaluation_metadata": {}}, "RaphaelOlivier/whisper_adversarial_examples": {"name": "RaphaelOlivier/whisper_adversarial_examples", "description": "Adversarial examples fooling whisper models", "evaluation_metadata": {}}, "SALT-NLP/wikisql_VALUE": {"name": "SALT-NLP/wikisql_VALUE", "description": "A large crowd-sourced dataset for developing natural language interfaces for relational databases", "evaluation_metadata": {}}, "AmazonScience/mintaka": {"name": "AmazonScience/mintaka", "description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers.", "evaluation_metadata": {}}, "SALT-NLP/spider_VALUE": {"name": "SALT-NLP/spider_VALUE", "description": "Spider is a large-scale complex and cross-domain semantic parsing and text-toSQL dataset annotated by 11 college students", "evaluation_metadata": {}}, "havens2/naacl2022": {"name": "havens2/naacl2022", "description": "NACL22 is a dataset labelled for Science Entity Recognition task, which is a subtask of NER task. \nThe text is from 2022 conference papers collected from ACL anthology. \nThe dataset is collected by Haotian Teng and Xiaoyue Cui. \nAnnotation standard can be found here https://github.com/neubig/nlp-from-scratch-assignment-2022/blob/main/annotation_standard.md", "evaluation_metadata": {}}, "PlanTL-GOB-ES/UD_Spanish-AnCora": {"name": "PlanTL-GOB-ES/UD_Spanish-AnCora", "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).", "evaluation_metadata": {}}, "PlanTL-GOB-ES/CoNLL-NERC-es": {"name": "PlanTL-GOB-ES/CoNLL-NERC-es", "description": "Named entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\n\nThe shared task of CoNLL-2002 concerns language-independent named entity recognition.\nWe will concentrate on four types of named entities: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups.\nThe participants of the shared task will be offered training and test data for at least two languages.\nThey will use the data for developing a named-entity recognition system that includes a machine learning component.\nInformation sources other than the training data may be used in this shared task.\nWe are especially interested in methods that can use additional unannotated data for improving their performance (for example co-training).\n\nThe train/validation/test sets are available in Spanish and Dutch.\n\nFor more details see https://www.clips.uantwerpen.be/conll2002/ner/ and https://www.aclweb.org/anthology/W02-2024/", "evaluation_metadata": {}}, "severo/glue": {"name": "severo/glue", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": [{"config": "cola", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence": "text", "label": "target"}}, {"config": "sst2", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence": "text", "label": "target"}}, {"config": "mrpc", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "qqp", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question1": "text1", "question2": "text2", "label": "target"}}, {"config": "stsb", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "mnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation_matched"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "mnli_mismatched", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "mnli_matched", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "qnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question": "text1", "sentence": "text2", "label": "target"}}, {"config": "rte", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "wnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}]}, "lmqg/qg_annotation": {"name": "lmqg/qg_annotation", "description": "Human-annotated question generated by models.", "evaluation_metadata": {}}, "society-ethics/lila_camera_traps": {"name": "society-ethics/lila_camera_traps", "description": "LILA Camera Traps is an aggregate data set of images taken by camera traps, which are devices that automatically (e.g. via motion detection) capture images of wild animals to help ecological research.\n\nThis data set is the first time when disparate camera trap data sets have been aggregated into a single training environment with a single taxonomy.\n\nThis data set consists of only camera trap image data sets, whereas the broader LILA website also has other data sets related to biology and conservation, intended as a resource for both machine learning (ML) researchers and those that want to harness ML for this topic.", "evaluation_metadata": {}}, "corentinm7/MyoQuant-SDH-Data": {"name": "corentinm7/MyoQuant-SDH-Data", "description": "This dataset is used to train the SDH model of MyoQuant to detect and quantify anomaly in the mitochondria repartition in SDH stained muscle fiber with myopathy disorders.", "evaluation_metadata": {}}, "nbtpj/BioNLP2021": {"name": "nbtpj/BioNLP2021", "description": "MEDIQA @ NAACL-BioNLP 2021 -- Task 2: Multi-answer summarization\nhttps://sites.google.com/view/mediqa2021\nBiomedical Summarization Data\nThe MEDIQA-AnS Dataset could be used for training.", "evaluation_metadata": {}}, "qanastek/HoC": {"name": "qanastek/HoC", "description": "The Hallmarks of Cancer Corpus for text classification\n\nThe Hallmarks of Cancer (HOC) Corpus consists of 1852 PubMed\npublication abstracts manually annotated by experts according\nto a taxonomy. The taxonomy consists of 37 classes in a\nhierarchy. Zero or more class labels are assigned to each\nsentence in the corpus. The labels are found under the \"labels\"\ndirectory, while the tokenized text can be found under \"text\"\ndirectory. The filenames are the corresponding PubMed IDs (PMID).\n\nIn addition to the HOC corpus, we also have the\n[Cancer Hallmarks Analytics Tool](http://chat.lionproject.net/)\nwhich classifes all of PubMed according to the HoC taxonomy.", "evaluation_metadata": {}}, "GEM/TaTA": {"name": "GEM/TaTA", "description": "Dataset loader for TaTA: A Multilingual Table-to-Text Dataset for African Languages", "evaluation_metadata": {}}, "ficsort/SzegedNER": {"name": "ficsort/SzegedNER", "description": "The recognition and classification of proper nouns and names in plain text is of key importance in Natural Language \nProcessing (NLP) as it has a beneficial effect on the performance of various types of applications, including \nInformation Extraction, Machine Translation, Syntactic Parsing/Chunking, etc.", "evaluation_metadata": {}}, "arbml/adawat": {"name": "arbml/adawat", "description": "Masader is the largest public catalogue for Arabic NLP datasets, which consists of more than 200 datasets annotated with 25 attributes.", "evaluation_metadata": {}}, "allenai/csabstruct": {"name": "allenai/csabstruct", "description": "As a step toward better document-level understanding, we explore classification of a sequence of sentences into their corresponding categories, a task that requires understanding sentences in context of the document. Recent successful models for this task have used hierarchical models to contextualize sentence representations, and Conditional Random Fields (CRFs) to incorporate dependencies between subsequent labels. In this work, we show that pretrained language models, BERT (Devlin et al., 2018) in particular, can be used for this task to capture contextual dependencies without the need for hierarchical encoding nor a CRF. Specifically, we construct a joint sentence representation that allows BERT Transformer layers to directly utilize contextual information from all words in all sentences. Our approach achieves state-of-the-art results on four datasets, including a new dataset of structured scientific abstracts.", "evaluation_metadata": {}}, "shunk031/cocostuff": {"name": "shunk031/cocostuff", "description": "COCO-Stuff augments all 164K images of the popular COCO dataset with pixel-level stuff annotations. These annotations can be used for scene understanding tasks like semantic segmentation, object detection and image captioning.", "evaluation_metadata": {}}, "SALT-NLP/MIC": {"name": "SALT-NLP/MIC", "description": "MIC is a resource for understanding the intuitions, values and moral judgments reflected in the utterances of dialogue systems", "evaluation_metadata": {}}, "LeandraFichtel/KAMEL": {"name": "LeandraFichtel/KAMEL", "description": "This dataset provides the data for KAMEL, a probing dataset for language models that contains factual knowledge\nfrom Wikidata and Wikipedia..", "evaluation_metadata": {}}, "sileod/probability_words_nli": {"name": "sileod/probability_words_nli", "description": "Probing neural language models for understanding of words of estimative probability", "evaluation_metadata": [{"config": "usnli", "task": "text-classification", "task_id": "multi-class-classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "context", "sentence2": "hypothesis", "label": "label"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 binary"}]}, {"config": "reasoning-1hop", "task": "text-classification", "task_id": "multi-class-classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "context", "sentence2": "hypothesis", "label": "label"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 binary"}]}, {"config": "reasoning-2hop", "task": "text-classification", "task_id": "multi-class-classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "context", "sentence2": "hypothesis", "label": "label"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 binary"}]}]}, "lmqg/qa_harvesting_from_wikipedia": {"name": "lmqg/qa_harvesting_from_wikipedia", "description": "QA pairs generated in https://aclanthology.org/P18-1177/", "evaluation_metadata": {}}, "lmqg/qa_squadshifts": {"name": "lmqg/qa_squadshifts", "description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "evaluation_metadata": {}}, "qanastek/Biosses-BLUE": {"name": "qanastek/Biosses-BLUE", "description": "BIOSSES is a benchmark dataset for biomedical sentence similarity estimation.\nThe dataset comprises 100 sentence pairs, in which each sentence was selected\nfrom the TAC (Text Analysis Conference) Biomedical Summarization Track Training\nDataset containing articles from the biomedical domain. The sentence pairs in\nBIOSSES were selected from citing sentences, i.e. sentences that have a citation\nto a reference article.\n\nThe sentence pairs were evaluated by five different human experts that judged\ntheir similarity and gave scores ranging from 0 (no relation) to 4 (equivalent).\nIn the original paper the mean of the scores assigned by the five human annotators\nwas taken as the gold standard. The Pearson correlation between the gold standard\nscores and the scores estimated by the models was used as the evaluation metric.\nThe strength of correlation can be assessed by the general guideline proposed by\nEvans (1996) as follows:\n\nvery strong: 0.80\u20131.00\nstrong: 0.60\u20130.79\nmoderate: 0.40\u20130.59\nweak: 0.20\u20130.39\nvery weak: 0.00\u20130.19", "evaluation_metadata": {}}, "jpwahle/dblp-discovery-dataset": {"name": "jpwahle/dblp-discovery-dataset", "description": "This repository provides metadata to papers from DBLP.", "evaluation_metadata": {}}, "alkzar90/cell_benchmark": {"name": "alkzar90/cell_benchmark", "description": "A segmentation dataset for [TODO: complete...]", "evaluation_metadata": {}}, "jinhybr/WildReceipt": {"name": "jinhybr/WildReceipt", "description": "WildReceipt is a collection of receipts. It contains, for each photo, a list of OCRs - with the bounding box, text, and class. It contains 1765 photos, with 25 classes, and 50000 text boxes. The goal is to benchmark \"key information extraction\" - extracting key information from documents\nhttps://arxiv.org/abs/2103.14470", "evaluation_metadata": {}}, "Conrad747/lg-ner": {"name": "Conrad747/lg-ner", "description": "LugandaPII is a named entity dataset consisting of PERSON, ORG, LOCATION, NORP, USERID and DATE entities.\nThe train/validation/test sets are available for the Luganda language.", "evaluation_metadata": {}}, "KETI-AIR/vqa": {"name": "KETI-AIR/vqa", "description": "# VQA\n\n## What is VQA?\nVQA is a new dataset containing open-ended questions about images. These questions require an understanding of vision, language and commonsense knowledge to answer.\n- 265,016 images (COCO and abstract scenes)\n- At least 3 questions (5.4 questions on average) per image\n- 10 ground truth answers per question\n- 3 plausible (but likely incorrect) answers per question\n- Automatic evaluation metric\n\n## Dataset\nDetails on downloading the latest dataset may be found on the [download webpage](https://visualqa.org/download.html).\n\n## Usage\n```python\nfrom datasets import load_dataset\n\nraw_datasets = load_dataset(\n \"vqa.py\", \n \"base\",\n cache_dir=\"huggingface_datasets\", \n data_dir=\"data\",\n ignore_verifications=True,\n )\n\ndataset_train = raw_datasets[\"train\"]\n\nfor item in dataset_train:\n print(item)\n exit()\n```\n\nv2 = v2.real + v2.abstract (v2.abstract == v1.abstract)\nv1 = v1.real + v1.abstract\nv2.abstract.balanced.bin", "evaluation_metadata": {}}, "bishalbaaniya/my_en": {"name": "bishalbaaniya/my_en", "description": "A parallel corpus of KDE4 localization files (v.2).\n92 languages, 4,099 bitexts\ntotal number of files: 75,535\ntotal number of tokens: 60.75M\ntotal number of sentence fragments: 8.89M", "evaluation_metadata": {}}, "Genius1237/TyDiP": {"name": "Genius1237/TyDiP", "description": "The TyDiP dataset is a dataset of requests in conversations between wikipedia editors\nthat have been annotated for politeness. The splits available below consists of only\nrequests from the top 25 percentile (polite) and bottom 25 percentile (impolite) of\npoliteness scores. The English train set and English test set that are\nadapted from the Stanford Politeness Corpus, and test data in 9 more languages\n(Hindi, Korean, Spanish, Tamil, French, Vietnamese, Russian, Afrikaans, Hungarian) \nwas annotated by us.", "evaluation_metadata": {}}, "lmqg/qag_tweetqa": {"name": "lmqg/qag_tweetqa", "description": "Question & answer generation dataset based on [TweetQA](https://huggingface.co/datasets/tweet_qa).", "evaluation_metadata": {}}, "lmqg/qag_squad": {"name": "lmqg/qag_squad", "description": "Question & answer generation dataset based on SQuAD.", "evaluation_metadata": {}}, "lawcompany/KLAID": {"name": "lawcompany/KLAID", "description": "KLAID (Korean Legal Artificial Intelligence Datasets) is a dataset for the development of Korean legal artificial intelligence technology. This time we offer 1 task, which is legal judgment prediction(LJP).", "evaluation_metadata": {}}, "bigbio/an_em": {"name": "bigbio/an_em", "description": "AnEM corpus is a domain- and species-independent resource manually annotated for anatomical\nentity mentions using a fine-grained classification system. The corpus consists of 500 documents\n(over 90,000 words) selected randomly from citation abstracts and full-text papers with\nthe aim of making the corpus representative of the entire available biomedical scientific\nliterature. The corpus annotation covers mentions of both healthy and pathological anatomical\nentities and contains over 3,000 annotated mentions.", "evaluation_metadata": {}}, "bigbio/anat_em": {"name": "bigbio/anat_em", "description": "The extended Anatomical Entity Mention corpus (AnatEM) consists of 1212 documents (approx. 250,000 words) manually annotated to identify over 13,000 mentions of anatomical entities. Each annotation is assigned one of 12 granularity-based types such as Cellular component, Tissue and Organ, defined with reference to the Common Anatomy Reference Ontology.", "evaluation_metadata": {}}, "bigbio/ask_a_patient": {"name": "bigbio/ask_a_patient", "description": "The AskAPatient dataset contains medical concepts written on social media mapped to how they are formally written in medical ontologies (SNOMED-CT and AMT).", "evaluation_metadata": {}}, "bigbio/bc7_litcovid": {"name": "bigbio/bc7_litcovid", "description": "The training and development datasets contain the publicly-available text of over 30 thousand COVID-19-related articles and their metadata (e.g., title, abstract, journal). Articles in both datasets have been manually reviewed and articles annotated by in-house models.", "evaluation_metadata": {}}, "bigbio/bio_sim_verb": {"name": "bigbio/bio_sim_verb", "description": "This repository contains the evaluation datasets for the paper Bio-SimVerb and Bio-SimLex: Wide-coverage Evaluation Sets of Word Similarity in Biomedicine by Billy Chiu, Sampo Pyysalo and Anna Korhonen.", "evaluation_metadata": {}}, "bigbio/bio_simlex": {"name": "bigbio/bio_simlex", "description": "Bio-SimLex enables intrinsic evaluation of word representations. This evaluation can serve as a predictor of performance on various downstream tasks in the biomedical domain. The results on Bio-SimLex using standard word representation models highlight the importance of developing dedicated evaluation resources for NLP in biomedicine for particular word classes (e.g. verbs).", "evaluation_metadata": {}}, "bigbio/bioasq_2021_mesinesp": {"name": "bigbio/bioasq_2021_mesinesp", "description": "The main aim of MESINESP2 is to promote the development of practically relevant semantic indexing tools for biomedical content in non-English language. We have generated a manually annotated corpus, where domain experts have labeled a set of scientific literature, clinical trials, and patent abstracts. All the documents were labeled with DeCS descriptors, which is a structured controlled vocabulary created by BIREME to index scientific publications on BvSalud, the largest database of scientific documents in Spanish, which hosts records from the databases LILACS, MEDLINE, IBECS, among others.\n\nMESINESP track at BioASQ9 explores the efficiency of systems for assigning DeCS to different types of biomedical documents. To that purpose, we have divided the task into three subtracks depending on the document type. Then, for each one we generated an annotated corpus which was provided to participating teams:\n\n- [Subtrack 1 corpus] MESINESP-L \u2013 Scientific Literature: It contains all Spanish records from LILACS and IBECS databases at the Virtual Health Library (VHL) with non-empty abstract written in Spanish.\n- [Subtrack 2 corpus] MESINESP-T- Clinical Trials contains records from Registro Espa\u00f1ol de Estudios Cl\u00ednicos (REEC). REEC doesn't provide documents with the structure title/abstract needed in BioASQ, for that reason we have built artificial abstracts based on the content available in the data crawled using the REEC API.\n- [Subtrack 3 corpus] MESINESP-P \u2013 Patents: This corpus includes patents in Spanish extracted from Google Patents which have the IPC code \u201cA61P\u201d and \u201cA61K31\u201d. In addition, we also provide a set of complementary data such as: the DeCS terminology file, a silver standard with the participants' predictions to the task background set and the entities of medications, diseases, symptoms and medical procedures extracted from the BSC NERs documents.", "evaluation_metadata": {}}, "bigbio/bioinfer": {"name": "bigbio/bioinfer", "description": "A corpus targeted at protein, gene, and RNA relationships which serves as a\nresource for the development of information extraction systems and their\ncomponents such as parsers and domain analyzers. Currently, the corpus contains\n1100 sentences from abstracts of biomedical research articles annotated for\nrelationships, named entities, as well as syntactic dependencies.", "evaluation_metadata": {}}, "bigbio/biology_how_why_corpus": {"name": "bigbio/biology_how_why_corpus", "description": "This dataset consists of 185 \"how\" and 193 \"why\" biology questions authored by a domain expert, with one or more gold \nanswer passages identified in an undergraduate textbook. The expert was not constrained in any way during the \nannotation process, so gold answers might be smaller than a paragraph or span multiple paragraphs. This dataset was \nused for the question-answering system described in the paper \u201cDiscourse Complements Lexical Semantics for Non-factoid \nAnswer Reranking\u201d (ACL 2014).", "evaluation_metadata": {}}, "bigbio/biomrc": {"name": "bigbio/biomrc", "description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.", "evaluation_metadata": {}}, "bigbio/bionlp_shared_task_2009": {"name": "bigbio/bionlp_shared_task_2009", "description": "The BioNLP Shared Task 2009 was organized by GENIA Project and its corpora were curated based\non the annotations of the publicly available GENIA Event corpus and an unreleased (blind) section\nof the GENIA Event corpus annotations, used for evaluation.", "evaluation_metadata": {}}, "bigbio/bionlp_st_2011_epi": {"name": "bigbio/bionlp_st_2011_epi", "description": "The dataset of the Epigenetics and Post-translational Modifications (EPI) task\nof BioNLP Shared Task 2011.", "evaluation_metadata": {}}, "bigbio/bionlp_st_2011_ge": {"name": "bigbio/bionlp_st_2011_ge", "description": "The BioNLP-ST GE task has been promoting development of fine-grained information extraction (IE) from biomedical\ndocuments, since 2009. Particularly, it has focused on the domain of NFkB as a model domain of Biomedical IE.\nThe GENIA task aims at extracting events occurring upon genes or gene products, which are typed as \"Protein\"\nwithout differentiating genes from gene products. Other types of physical entities, e.g. cells, cell components,\nare not differentiated from each other, and their type is given as \"Entity\".", "evaluation_metadata": {}}, "bigbio/bionlp_st_2011_id": {"name": "bigbio/bionlp_st_2011_id", "description": "The dataset of the Infectious Diseases (ID) task of\nBioNLP Shared Task 2011.", "evaluation_metadata": {}}, "bigbio/bionlp_st_2011_rel": {"name": "bigbio/bionlp_st_2011_rel", "description": "The Entity Relations (REL) task is a supporting task of the BioNLP Shared Task 2011.\nThe task concerns the extraction of two types of part-of relations between a\ngene/protein and an associated entity.", "evaluation_metadata": {}}, "bigbio/bionlp_st_2013_cg": {"name": "bigbio/bionlp_st_2013_cg", "description": "the Cancer Genetics (CG) is a event extraction task and a main task of the BioNLP Shared Task (ST) 2013.\nThe CG task is an information extraction task targeting the recognition of events in text,\nrepresented as structured n-ary associations of given physical entities. In addition to\naddressing the cancer domain, the CG task is differentiated from previous event extraction\ntasks in the BioNLP ST series in addressing a wide range of pathological processes and multiple\nlevels of biological organization, ranging from the molecular through the cellular and organ\nlevels up to whole organisms. Final test set submissions were accepted from six teams", "evaluation_metadata": {}}, "bigbio/bionlp_st_2013_ge": {"name": "bigbio/bionlp_st_2013_ge", "description": "The BioNLP-ST GE task has been promoting development of fine-grained\ninformation extraction (IE) from biomedical\ndocuments, since 2009. Particularly, it has focused on the domain of\nNFkB as a model domain of Biomedical IE", "evaluation_metadata": {}}, "bigbio/bionlp_st_2013_gro": {"name": "bigbio/bionlp_st_2013_gro", "description": "GRO Task: Populating the Gene Regulation Ontology with events and\nrelations. A data set from the bio NLP shared tasks competition from 2013", "evaluation_metadata": {}}, "bigbio/bionlp_st_2013_pc": {"name": "bigbio/bionlp_st_2013_pc", "description": "the Pathway Curation (PC) task is a main event extraction task of the BioNLP shared task (ST) 2013.\nThe PC task concerns the automatic extraction of biomolecular reactions from text.\nThe task setting, representation and semantics are defined with respect to pathway\nmodel standards and ontologies (SBML, BioPAX, SBO) and documents selected by relevance\nto specific model reactions. Two BioNLP ST 2013 participants successfully completed\nthe PC task. The highest achieved F-score, 52.8%, indicates that event extraction is\na promising approach to supporting pathway curation efforts.", "evaluation_metadata": {}}, "bigbio/bionlp_st_2019_bb": {"name": "bigbio/bionlp_st_2019_bb", "description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.", "evaluation_metadata": {}}, "bigbio/biorelex": {"name": "bigbio/biorelex", "description": "BioRelEx is a biological relation extraction dataset. Version 1.0 contains 2010\nannotated sentences that describe binding interactions between various\nbiological entities (proteins, chemicals, etc.). 1405 sentences are for\ntraining, another 201 sentences are for validation. They are publicly available\nat https://github.com/YerevaNN/BioRelEx/releases. Another 404 sentences are for\ntesting which are kept private for at this Codalab competition\nhttps://competitions.codalab.org/competitions/20468. All sentences contain words\n\"bind\", \"bound\" or \"binding\". For every sentence we provide: 1) Complete\nannotations of all biological entities that appear in the sentence 2) Entity\ntypes (32 types) and grounding information for most of the proteins and families\n(links to uniprot, interpro and other databases) 3) Coreference between entities\nin the same sentence (e.g. abbreviations and synonyms) 4) Binding interactions\nbetween the annotated entities 5) Binding interaction types: positive, negative\n(A does not bind B) and neutral (A may bind to B)", "evaluation_metadata": {}}, "bigbio/cantemist": {"name": "bigbio/cantemist", "description": "Collection of 1301 oncological clinical case reports written in Spanish, with tumor morphology mentions manually annotated and mapped by clinical experts to a controlled terminology. Every tumor morphology mention is linked to an eCIE-O code (the Spanish equivalent of ICD-O).\n\nThe original dataset is distributed in Brat format, and was randomly sampled into 3 subsets. The training, development and test sets contain 501, 500 and 300 documents each, respectively.\n\nThis dataset was designed for the CANcer TExt Mining Shared Task, sponsored by Plan-TL. The task is divided in 3 subtasks: CANTEMIST-NER, CANTEMIST_NORM and CANTEMIST-CODING.\n\nCANTEMIST-NER track: requires finding automatically tumor morphology mentions. All tumor morphology mentions are defined by their corresponding character offsets in UTF-8 plain text medical documents. \n\nCANTEMIST-NORM track: clinical concept normalization or named entity normalization task that requires to return all tumor morphology entity mentions together with their corresponding eCIE-O-3.1 codes i.e. finding and normalizing tumor morphology mentions.\n\nCANTEMIST-CODING track: requires returning for each of document a ranked list of its corresponding ICD-O-3 codes. This it is essentially a sort of indexing or multi-label classification task or oncology clinical coding. \n\nFor further information, please visit https://temu.bsc.es/cantemist or send an email to encargo-pln-life@bsc.es", "evaluation_metadata": {}}, "bigbio/cellfinder": {"name": "bigbio/cellfinder", "description": "The CellFinder project aims to create a stem cell data repository by linking information from existing public databases and by performing text mining on the research literature. The first version of the corpus is composed of 10 full text documents containing more than 2,100 sentences, 65,000 tokens and 5,200 annotations for entities. The corpus has been annotated with six types of entities (anatomical parts, cell components, cell lines, cell types, genes/protein and species) with an overall inter-annotator agreement around 80%.\n\nSee: https://www.informatik.hu-berlin.de/de/forschung/gebiete/wbi/resources/cellfinder/", "evaluation_metadata": {}}, "bigbio/chebi_nactem": {"name": "bigbio/chebi_nactem", "description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.", "evaluation_metadata": {}}, "bigbio/chemprot": {"name": "bigbio/chemprot", "description": "The BioCreative VI Chemical-Protein interaction dataset identifies entities of\nchemicals and proteins and their likely relation to one other. Compounds are\ngenerally agonists (activators) or antagonists (inhibitors) of proteins.", "evaluation_metadata": {}}, "bigbio/chia": {"name": "bigbio/chia", "description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.", "evaluation_metadata": {}}, "bigbio/citation_gia_test_collection": {"name": "bigbio/citation_gia_test_collection", "description": "The Citation GIA Test Collection was recently created for gene indexing at the\nNLM and includes 151 PubMed abstracts with both mention-level and document-level\nannotations. They are selected because both have a focus on human genes.", "evaluation_metadata": {}}, "bigbio/codiesp": {"name": "bigbio/codiesp", "description": "Synthetic corpus of 1,000 manually selected clinical case studies in Spanish\nthat was designed for the Clinical Case Coding in Spanish Shared Task, as part\nof the CLEF 2020 conference.\n\nThe goal of the task was to automatically assign ICD10 codes (CIE-10, in\nSpanish) to clinical case documents, being evaluated against manually generated\nICD10 codifications. The CodiEsp corpus was selected manually by practicing\nphysicians and clinical documentalists and annotated by clinical coding\nprofessionals meeting strict quality criteria. They reached an inter-annotator\nagreement of 88.6% for diagnosis coding, 88.9% for procedure coding and 80.5%\nfor the textual reference annotation.\n\nThe final collection of 1,000 clinical cases that make up the corpus had a total\nof 16,504 sentences and 396,988 words. All documents are in Spanish language and\nCIE10 is the coding terminology (the Spanish version of ICD10-CM and ICD10-PCS).\nThe CodiEsp corpus has been randomly sampled into three subsets. The train set\ncontains 500 clinical cases, while the development and test sets have 250\nclinical cases each. In addition to these, a collection of 176,294 abstracts\nfrom Lilacs and Ibecs with the corresponding ICD10 codes (ICD10-CM and\nICD10-PCS) was provided by the task organizers. Every abstract has at least one\nassociated code, with an average of 2.5 ICD10 codes per abstract.\n\nThe CodiEsp track was divided into three sub-tracks (2 main and 1 exploratory):\n\n- CodiEsp-D: The Diagnosis Coding sub-task, which requires automatic ICD10-CM\n [CIE10-Diagn\u00f3stico] code assignment.\n- CodiEsp-P: The Procedure Coding sub-task, which requires automatic ICD10-PCS\n [CIE10-Procedimiento] code assignment.\n- CodiEsp-X: The Explainable AI exploratory sub-task, which requires to submit\n the reference to the predicted codes (both ICD10-CM and ICD10-PCS). The goal \n of this novel task was not only to predict the correct codes but also to \n present the reference in the text that supports the code predictions.\n\nFor further information, please visit https://temu.bsc.es/codiesp or send an\nemail to encargo-pln-life@bsc.es", "evaluation_metadata": {}}, "bigbio/ctebmsp": {"name": "bigbio/ctebmsp", "description": "The \"abstracts\" subset of the Clinical Trials for Evidence-Based Medicine in Spanish\n(CT-EBM-SP) corpus contains 500 abstracts of clinical trial studies in Spanish,\npublished in journals with a Creative Commons license. Most were downloaded from\nthe SciELO repository and free abstracts in PubMed.\n\nAbstracts were retrieved with the query:\nClinical Trial[ptyp] AND \u201cloattrfree full text\u201d[sb] AND \u201cspanish\u201d[la].\n\n(Information collected from 10.1186/s12911-021-01395-z)", "evaluation_metadata": {}}, "bigbio/ddi_corpus": {"name": "bigbio/ddi_corpus", "description": "The DDI corpus has been manually annotated with drugs and pharmacokinetics and pharmacodynamics interactions. It contains 1025 documents from two different sources: DrugBank database and MedLine.", "evaluation_metadata": {}}, "bigbio/distemist": {"name": "bigbio/distemist", "description": "The DisTEMIST corpus is a collection of 1000 clinical cases with disease annotations linked with Snomed-CT concepts.\nAll documents are released in the context of the BioASQ DisTEMIST track for CLEF 2022.", "evaluation_metadata": {}}, "bigbio/ehr_rel": {"name": "bigbio/ehr_rel", "description": "EHR-Rel is a novel open-source1 biomedical concept relatedness dataset consisting of 3630 concept pairs, six times more\nthan the largest existing dataset. Instead of manually selecting and pairing concepts as done in previous work,\nthe dataset is sampled from EHRs to ensure concepts are relevant for the EHR concept retrieval task.\nA detailed analysis of the concepts in the dataset reveals a far larger coverage compared to existing datasets.", "evaluation_metadata": {}}, "bigbio/euadr": {"name": "bigbio/euadr", "description": "Corpora with specific entities and relationships annotated are essential to train and evaluate text-mining systems that are developed to extract specific structured information from a large corpus. In this paper we describe an approach where a named-entity recognition system produces a first annotation and annotators revise this annotation using a web-based interface. The agreement figures achieved show that the inter-annotator agreement is much better than the agreement with the system provided annotations. The corpus has been annotated for drugs, disorders, genes and their inter-relationships. For each of the drug-disorder, drug-target, and target-disorder relations three experts have annotated a set of 100 abstracts. These annotated relationships will be used to train and evaluate text-mining software to capture these relationships in texts.", "evaluation_metadata": {}}, "bigbio/evidence_inference": {"name": "bigbio/evidence_inference", "description": "The dataset consists of biomedical articles describing randomized control trials (RCTs) that compare multiple\ntreatments. Each of these articles will have multiple questions, or 'prompts' associated with them.\nThese prompts will ask about the relationship between an intervention and comparator with respect to an outcome,\nas reported in the trial. For example, a prompt may ask about the reported effects of aspirin as compared\nto placebo on the duration of headaches. For the sake of this task, we assume that a particular article\nwill report that the intervention of interest either significantly increased, significantly decreased\nor had significant effect on the outcome, relative to the comparator.", "evaluation_metadata": {}}, "bigbio/genetag": {"name": "bigbio/genetag", "description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..", "evaluation_metadata": {}}, "bigbio/genia_ptm_event_corpus": {"name": "bigbio/genia_ptm_event_corpus", "description": "Post-translational-modi\ufb01cations (PTM), amino acid modi\ufb01cations of proteins after translation, are one of the posterior processes of protein biosynthesis for many proteins, and they are critical for determining protein function such as its activity state, localization, turnover and interactions with other biomolecules. While there have been many studies of information extraction targeting individual PTM types, there was until recently little effort to address extraction of multiple PTM types at once in a unified framework.", "evaluation_metadata": {}}, "bigbio/genia_relation_corpus": {"name": "bigbio/genia_relation_corpus", "description": "The extraction of various relations stated to hold between biomolecular entities is one of the most frequently\naddressed information extraction tasks in domain studies. Typical relation extraction targets involve protein-protein\ninteractions or gene regulatory relations. However, in the GENIA corpus, such associations involving change in the\nstate or properties of biomolecules are captured in the event annotation.\n\nThe GENIA corpus relation annotation aims to complement the event annotation of the corpus by capturing (primarily)\nstatic relations, relations such as part-of that hold between entities without (necessarily) involving change.", "evaluation_metadata": {}}, "bigbio/genia_term_corpus": {"name": "bigbio/genia_term_corpus", "description": "The identification of linguistic expressions referring to entities of interest in molecular biology such as proteins,\ngenes and cells is a fundamental task in biomolecular text mining. The GENIA technical term annotation covers the\nidentification of physical biological entities as well as other important terms. The corpus annotation covers the full\n1,999 abstracts of the primary GENIA corpus.", "evaluation_metadata": {}}, "bigbio/geokhoj_v1": {"name": "bigbio/geokhoj_v1", "description": "GEOKhoj v1 is a annotated corpus of control/perturbation labels for 30,000 samples\nfrom Microarray, Transcriptomics and Single cell experiments which are available on\nthe GEO (Gene Expression Omnibus) database", "evaluation_metadata": {}}, "bigbio/hallmarks_of_cancer": {"name": "bigbio/hallmarks_of_cancer", "description": "The Hallmarks of Cancer (HOC) Corpus consists of 1852 PubMed publication\nabstracts manually annotated by experts according to a taxonomy. The taxonomy\nconsists of 37 classes in a hierarchy. Zero or more class labels are assigned\nto each sentence in the corpus. The labels are found under the \"labels\"\ndirectory, while the tokenized text can be found under \"text\" directory.\nThe filenames are the corresponding PubMed IDs (PMID).", "evaluation_metadata": {}}, "bigbio/hprd50": {"name": "bigbio/hprd50", "description": "HPRD50 is a dataset of randomly selected, hand-annotated abstracts of biomedical papers\nreferenced by the Human Protein Reference Database (HPRD). It is parsed in XML format,\nsplitting each abstract into sentences, and in each sentence there may be entities and\ninteractions between those entities. In this particular dataset, entities are all\nproteins and interactions are thus protein-protein interactions.\n\nMoreover, all entities are normalized to the HPRD database. These normalized terms are\nstored in each entity's 'type' attribute in the source XML. This means the dataset can\ndetermine e.g. that \"Janus kinase 2\" and \"Jak2\" are referencing the same normalized\nentity.\n\nBecause the dataset contains entities and relations, it is suitable for Named Entity\nRecognition and Relation Extraction.", "evaluation_metadata": {}}, "bigbio/iepa": {"name": "bigbio/iepa", "description": "The IEPA benchmark PPI corpus is designed for relation extraction. It was created from 303 PubMed abstracts, each of which contains a specific pair of co-occurring chemicals.", "evaluation_metadata": {}}, "bigbio/linnaeus": {"name": "bigbio/linnaeus", "description": "Linnaeus is a novel corpus of full-text documents manually annotated for species mentions.", "evaluation_metadata": {}}, "bigbio/lll": {"name": "bigbio/lll", "description": "The LLL05 challenge task is to learn rules to extract protein/gene interactions from biology abstracts from the Medline\nbibliography database. The goal of the challenge is to test the ability of the participating IE systems to identify the\ninteractions and the gene/proteins that interact. The participants will test their IE patterns on a test set with the\naim of extracting the correct agent and target.The challenge focuses on information extraction of gene interactions in\nBacillus subtilis. Extracting gene interaction is the most popular event IE task in biology. Bacillus subtilis (Bs) is\na model bacterium and many papers have been published on direct gene interactions involved in sporulation. The gene\ninteractions are generally mentioned in the abstract and the full text of the paper is not needed. Extracting gene\ninteraction means, extracting the agent (proteins) and the target (genes) of all couples of genic interactions from\nsentences.", "evaluation_metadata": {}}, "bigbio/mayosrs": {"name": "bigbio/mayosrs", "description": "MayoSRS consists of 101 clinical term pairs whose relatedness was determined by nine medical coders and three physicians from the Mayo Clinic.", "evaluation_metadata": {}}, "bigbio/med_qa": {"name": "bigbio/med_qa", "description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.", "evaluation_metadata": {}}, "bigbio/meddialog": {"name": "bigbio/meddialog", "description": "The MedDialog dataset (English) contains conversations (in English) between doctors and patients.It has 0.26 million dialogues. The data is continuously growing and more dialogues will be added. The raw dialogues are from healthcaremagic.com and icliniq.com.\nAll copyrights of the data belong to healthcaremagic.com and icliniq.com.", "evaluation_metadata": {}}, "bigbio/meddocan": {"name": "bigbio/meddocan", "description": "MEDDOCAN: Medical Document Anonymization Track\n\nThis dataset is designed for the MEDDOCAN task, sponsored by Plan de Impulso de las Tecnolog\u00edas del Lenguaje.\n\nIt is a manually classified collection of 1,000 clinical case reports derived from the Spanish Clinical Case Corpus (SPACCC), enriched with PHI expressions.\n\nThe annotation of the entire set of entity mentions was carried out by experts annotatorsand it includes 29 entity types relevant for the annonymiation of medical documents.22 of these annotation types are actually present in the corpus: TERRITORIO, FECHAS, EDAD_SUJETO_ASISTENCIA, NOMBRE_SUJETO_ASISTENCIA, NOMBRE_PERSONAL_SANITARIO, SEXO_SUJETO_ASISTENCIA, CALLE, PAIS, ID_SUJETO_ASISTENCIA, CORREO, ID_TITULACION_PERSONAL_SANITARIO,ID_ASEGURAMIENTO, HOSPITAL, FAMILIARES_SUJETO_ASISTENCIA, INSTITUCION, ID_CONTACTO ASISTENCIAL,NUMERO_TELEFONO, PROFESION, NUMERO_FAX, OTROS_SUJETO_ASISTENCIA, CENTRO_SALUD, ID_EMPLEO_PERSONAL_SANITARIO\n \nFor further information, please visit https://temu.bsc.es/meddocan/ or send an email to encargo-pln-life@bsc.es", "evaluation_metadata": {}}, "bigbio/medhop": {"name": "bigbio/medhop", "description": "With the same format as WikiHop, this dataset is based on research paper\nabstracts from PubMed, and the queries are about interactions between\npairs of drugs. The correct answer has to be inferred by combining\ninformation from a chain of reactions of drugs and proteins.", "evaluation_metadata": {}}, "bigbio/mediqa_qa": {"name": "bigbio/mediqa_qa", "description": "The MEDIQA challenge is an ACL-BioNLP 2019 shared task aiming to attract further research efforts in Natural Language Inference (NLI), Recognizing Question Entailment (RQE), and their applications in medical Question Answering (QA).\nMailing List: https://groups.google.com/forum/#!forum/bionlp-mediqa\n\nIn the QA task, participants are tasked to:\n- filter/classify the provided answers (1: correct, 0: incorrect).\n- re-rank the answers.", "evaluation_metadata": {}}, "bigbio/mediqa_rqe": {"name": "bigbio/mediqa_rqe", "description": "The MEDIQA challenge is an ACL-BioNLP 2019 shared task aiming to attract further research efforts in Natural Language Inference (NLI), Recognizing Question Entailment (RQE), and their applications in medical Question Answering (QA).\nMailing List: https://groups.google.com/forum/#!forum/bionlp-mediqa\n\nThe objective of the RQE task is to identify entailment between two questions in the context of QA. We use the following definition of question entailment: \u201ca question A entails a question B if every answer to B is also a complete or partial answer to A\u201d [1]\n [1] A. Ben Abacha & D. Demner-Fushman. \u201cRecognizing Question Entailment for Medical Question Answering\u201d. AMIA 2016.", "evaluation_metadata": {}}, "bigbio/medmentions": {"name": "bigbio/medmentions", "description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.", "evaluation_metadata": {}}, "bigbio/meqsum": {"name": "bigbio/meqsum", "description": "Dataset for medical question summarization introduced in the ACL 2019 paper \"On the Summarization of Consumer Health\nQuestions\". Question understanding is one of the main challenges in question answering. In real world applications,\nusers often submit natural language questions that are longer than needed and include peripheral information that\nincreases the complexity of the question, leading to substantially more false positives in answer retrieval. In this\npaper, we study neural abstractive models for medical question summarization. We introduce the MeQSum corpus of 1,000\nsummarized consumer health questions.", "evaluation_metadata": {}}, "bigbio/minimayosrs": {"name": "bigbio/minimayosrs", "description": "MiniMayoSRS is a subset of the MayoSRS and consists of 30 term pairs on which a higher inter-annotator agreement was\nachieved. The average correlation between physicians is 0.68. The average correlation between medical coders is 0.78.", "evaluation_metadata": {}}, "bigbio/mirna": {"name": "bigbio/mirna", "description": "The corpus consists of 301 Medline citations. The documents were screened for\nmentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually\nannotated. The corpus comprises of two separate files, a train and a test set, coming\nfrom 201 and 100 documents respectively.", "evaluation_metadata": {}}, "bigbio/mlee": {"name": "bigbio/mlee", "description": "MLEE is an event extraction corpus consisting of manually annotated abstracts of papers\non angiogenesis. It contains annotations for entities, relations, events and coreferences\nThe annotations span molecular, cellular, tissue, and organ-level processes.", "evaluation_metadata": {}}, "bigbio/mqp": {"name": "bigbio/mqp", "description": "Medical Question Pairs dataset by McCreery et al (2020) contains pairs of medical questions and paraphrased versions of \nthe question prepared by medical professional. Paraphrased versions were labelled as similar (syntactically dissimilar \nbut contextually similar ) or dissimilar (syntactically may look similar but contextually dissimilar). Labels 1: similar, 0: dissimilar", "evaluation_metadata": {}}, "bigbio/muchmore": {"name": "bigbio/muchmore", "description": "The corpus used in the MuchMore project is a parallel corpus of English-German scientific\nmedical abstracts obtained from the Springer Link web site. The corpus consists\napproximately of 1 million tokens for each language. Abstracts are from 41 medical\njournals, each of which constitutes a relatively homogeneous medical sub-domain (e.g.\nNeurology, Radiology, etc.). The corpus of downloaded HTML documents is normalized in\nvarious ways, in order to produce a clean, plain text version, consisting of a title, abstract\nand keywords. Additionally, the corpus was aligned on the sentence level.\n\nAutomatic (!) annotation includes: Part-of-Speech; Morphology (inflection and\ndecomposition); Chunks; Semantic Classes (UMLS: Unified Medical Language System,\nMeSH: Medical Subject Headings, EuroWordNet); Semantic Relations from UMLS.", "evaluation_metadata": {}}, "bigbio/multi_xscience": {"name": "bigbio/multi_xscience", "description": "Multi-document summarization is a challenging task for which there exists little large-scale datasets. \nWe propose Multi-XScience, a large-scale multi-document summarization dataset created from scientific articles. \nMulti-XScience introduces a challenging multi-document summarization task: writing the related-work section \nof a paper based on its abstract and the articles it references. Our work is inspired by extreme summarization, \na dataset construction protocol that favours abstractive modeling approaches. Descriptive statistics and \nempirical results---using several state-of-the-art models trained on the Multi-XScience dataset---reveal t\nhat Multi-XScience is well suited for abstractive models.", "evaluation_metadata": {}}, "bigbio/osiris": {"name": "bigbio/osiris", "description": "The OSIRIS corpus is a set of MEDLINE abstracts manually annotated\nwith human variation mentions. The corpus is distributed under the terms\nof the Creative Commons Attribution License\nCreative Commons Attribution 3.0 Unported License,\nwhich permits unrestricted use, distribution, and reproduction in any medium,\nprovided the original work is properly cited (Furlong et al, BMC Bioinformatics 2008, 9:84).", "evaluation_metadata": {}}, "bigbio/paramed": {"name": "bigbio/paramed", "description": "NEJM is a Chinese-English parallel corpus crawled from the New England Journal of Medicine website. \nEnglish articles are distributed through https://www.nejm.org/ and Chinese articles are distributed through \nhttp://nejmqianyan.cn/. The corpus contains all article pairs (around 2000 pairs) since 2011.", "evaluation_metadata": {}}, "bigbio/pdr": {"name": "bigbio/pdr", "description": "The corpus of plant-disease relation consists of plants and diseases and their relation to PubMed abstract.\nThe corpus consists of about 2400 plant and disease entities and 300 annotated relations from 179 abstracts.", "evaluation_metadata": {}}, "bigbio/pharmaconer": {"name": "bigbio/pharmaconer", "description": "PharmaCoNER: Pharmacological Substances, Compounds and Proteins Named Entity Recognition track\n\nThis dataset is designed for the PharmaCoNER task, sponsored by Plan de Impulso de las Tecnolog\u00edas del Lenguaje.\n\nIt is a manually classified collection of clinical case studies derived from the Spanish Clinical Case Corpus (SPACCC), an open access electronic library that gathers Spanish medical publications from SciELO (Scientific Electronic Library Online).\n\nThe annotation of the entire set of entity mentions was carried out by medicinal chemistry experts and it includes the following 4 entity types: NORMALIZABLES, NO_NORMALIZABLES, PROTEINAS and UNCLEAR.\n\nThe PharmaCoNER corpus contains a total of 396,988 words and 1,000 clinical cases that have been randomly sampled into 3 subsets. The training set contains 500 clinical cases, while the development and test sets contain 250 clinical cases each.\n\nFor further information, please visit https://temu.bsc.es/pharmaconer/ or send an email to encargo-pln-life@bsc.es", "evaluation_metadata": {}}, "bigbio/pico_extraction": {"name": "bigbio/pico_extraction", "description": "This dataset contains annotations for Participants, Interventions, and Outcomes (referred to as PICO task).\nFor 423 sentences, annotations collected by 3 medical experts are available.\nTo get the final annotations, we perform the majority voting.", "evaluation_metadata": {}}, "bigbio/pmc_patients": {"name": "bigbio/pmc_patients", "description": "This dataset is used for calculating the similarity between two patient descriptions.", "evaluation_metadata": {}}, "bigbio/progene": {"name": "bigbio/progene", "description": "The Protein/Gene corpus was developed at the JULIE Lab Jena under supervision of Prof. Udo Hahn.\nThe executing scientist was Dr. Joachim Wermter.\nThe main annotator was Dr. Rico Pusch who is an expert in biology.\nThe corpus was developed in the context of the StemNet project (http://www.stemnet.de/).", "evaluation_metadata": {}}, "bigbio/pubhealth": {"name": "bigbio/pubhealth", "description": "A dataset of 11,832 claims for fact- checking, which are related a range of health topics\nincluding biomedical subjects (e.g., infectious diseases, stem cell research), government healthcare policy\n(e.g., abortion, mental health, women\u2019s health), and other public health-related stories", "evaluation_metadata": {}}, "bigbio/pubmed_qa": {"name": "bigbio/pubmed_qa", "description": "PubMedQA is a novel biomedical question answering (QA) dataset collected from PubMed abstracts.\nThe task of PubMedQA is to answer research biomedical questions with yes/no/maybe using the corresponding abstracts.\nPubMedQA has 1k expert-annotated (PQA-L), 61.2k unlabeled (PQA-U) and 211.3k artificially generated QA instances (PQA-A).\n\nEach PubMedQA instance is composed of:\n (1) a question which is either an existing research article title or derived from one,\n (2) a context which is the corresponding PubMed abstract without its conclusion,\n (3) a long answer, which is the conclusion of the abstract and, presumably, answers the research question, and\n (4) a yes/no/maybe answer which summarizes the conclusion.\n\nPubMedQA is the first QA dataset where reasoning over biomedical research texts,\nespecially their quantitative contents, is required to answer the questions.\n\nPubMedQA datasets comprise of 3 different subsets:\n (1) PubMedQA Labeled (PQA-L): A labeled PubMedQA subset comprises of 1k manually annotated yes/no/maybe QA data collected from PubMed articles.\n (2) PubMedQA Artificial (PQA-A): An artificially labelled PubMedQA subset comprises of 211.3k PubMed articles with automatically generated questions from the statement titles and yes/no answer labels generated using a simple heuristic.\n (3) PubMedQA Unlabeled (PQA-U): An unlabeled PubMedQA subset comprises of 61.2k context-question pairs data collected from PubMed articles.", "evaluation_metadata": {}}, "bigbio/quaero": {"name": "bigbio/quaero", "description": "The QUAERO French Medical Corpus has been initially developed as a resource for named entity recognition and normalization [1]. It was then improved with the purpose of creating a gold standard set of normalized entities for French biomedical text, that was used in the CLEF eHealth evaluation lab [2][3].\n\nA selection of MEDLINE titles and EMEA documents were manually annotated. The annotation process was guided by concepts in the Unified Medical Language System (UMLS):\n\n1. Ten types of clinical entities, as defined by the following UMLS Semantic Groups (Bodenreider and McCray 2003) were annotated: Anatomy, Chemical and Drugs, Devices, Disorders, Geographic Areas, Living Beings, Objects, Phenomena, Physiology, Procedures.\n\n2. The annotations were made in a comprehensive fashion, so that nested entities were marked, and entities could be mapped to more than one UMLS concept. In particular: (a) If a mention can refer to more than one Semantic Group, all the relevant Semantic Groups should be annotated. For instance, the mention \u201cr\u00e9cidive\u201d (recurrence) in the phrase \u201cpr\u00e9vention des r\u00e9cidives\u201d (recurrence prevention) should be annotated with the category \u201cDISORDER\u201d (CUI C2825055) and the category \u201cPHENOMENON\u201d (CUI C0034897); (b) If a mention can refer to more than one UMLS concept within the same Semantic Group, all the relevant concepts should be annotated. For instance, the mention \u201cmaniaques\u201d (obsessive) in the phrase \u201cpatients maniaques\u201d (obsessive patients) should be annotated with CUIs C0564408 and C0338831 (category \u201cDISORDER\u201d); (c) Entities which span overlaps with that of another entity should still be annotated. For instance, in the phrase \u201cinfarctus du myocarde\u201d (myocardial infarction), the mention \u201cmyocarde\u201d (myocardium) should be annotated with category \u201cANATOMY\u201d (CUI C0027061) and the mention \u201cinfarctus du myocarde\u201d should be annotated with category \u201cDISORDER\u201d (CUI C0027051)\n\nThe QUAERO French Medical Corpus BioC release comprises a subset of the QUAERO French Medical corpus, as follows:\n\nTraining data (BRAT version used in CLEF eHealth 2015 task 1b as training data): \n- MEDLINE_train_bioc file: 833 MEDLINE titles, annotated with normalized entities in the BioC format \n- EMEA_train_bioc file: 3 EMEA documents, segmented into 11 sub-documents, annotated with normalized entities in the BioC format \n\nDevelopment data (BRAT version used in CLEF eHealth 2015 task 1b as test data and in CLEF eHealth 2016 task 2 as development data): \n- MEDLINE_dev_bioc file: 832 MEDLINE titles, annotated with normalized entities in the BioC format\n- EMEA_dev_bioc file: 3 EMEA documents, segmented into 12 sub-documents, annotated with normalized entities in the BioC format \n\nTest data (BRAT version used in CLEF eHealth 2016 task 2 as test data): \n- MEDLINE_test_bioc folder: 833 MEDLINE titles, annotated with normalized entities in the BioC format \n- EMEA folder_test_bioc: 4 EMEA documents, segmented into 15 sub-documents, annotated with normalized entities in the BioC format \n\n\n\nThis release of the QUAERO French medical corpus, BioC version, comes in the BioC format, through automatic conversion from the original BRAT format obtained with the Brat2BioC tool https://bitbucket.org/nicta_biomed/brat2bioc developped by Jimeno Yepes et al.\n\nAntonio Jimeno Yepes, Mariana Neves, Karin Verspoor \nBrat2BioC: conversion tool between brat and BioC\nBioCreative IV track 1 - BioC: The BioCreative Interoperability Initiative, 2013\n\n\nPlease note that the original version of the QUAERO corpus distributed in the CLEF eHealth challenge 2015 and 2016 came in the BRAT stand alone format. It was distributed with the CLEF eHealth evaluation tool. This original distribution of the QUAERO French Medical corpus is available separately from https://quaerofrenchmed.limsi.fr \n\nAll questions regarding the task or data should be addressed to aurelie.neveol@limsi.fr", "evaluation_metadata": {}}, "bigbio/scai_chemical": {"name": "bigbio/scai_chemical", "description": "SCAI Chemical is a corpus of MEDLINE abstracts that has been annotated\nto give an overview of the different chemical name classes\nfound in MEDLINE text.", "evaluation_metadata": {}}, "bigbio/scai_disease": {"name": "bigbio/scai_disease", "description": "SCAI Disease is a dataset annotated in 2010 with mentions of diseases and\nadverse effects. It is a corpus containing 400 randomly selected MEDLINE\nabstracts generated using \u2018Disease OR Adverse effect\u2019 as a PubMed query. This\nevaluation corpus was annotated by two individuals who hold a Master\u2019s degree\nin life sciences.", "evaluation_metadata": {}}, "bigbio/scicite": {"name": "bigbio/scicite", "description": "SciCite is a dataset of 11K manually annotated citation intents based on\ncitation context in the computer science and biomedical domains.", "evaluation_metadata": {}}, "bigbio/scielo": {"name": "bigbio/scielo", "description": "A parallel corpus of full-text scientific articles collected from Scielo database in the following languages: English, Portuguese and Spanish. The corpus is sentence aligned for all language pairs, as well as trilingual aligned for a small subset of sentences. Alignment was carried out using the Hunalign algorithm.", "evaluation_metadata": {}}, "bigbio/scifact": {"name": "bigbio/scifact", "description": " {_DESCRIPTION_BASE} This config connects the claims to the evidence and doc ids.", "evaluation_metadata": {}}, "bigbio/sciq": {"name": "bigbio/sciq", "description": "The SciQ dataset contains 13,679 crowdsourced science exam questions about Physics, Chemistry and Biology, among others. The questions are in multiple-choice format with 4 answer options each. For most questions, an additional paragraph with supporting evidence for the correct answer is provided.", "evaluation_metadata": {}}, "bigbio/spl_adr_200db": {"name": "bigbio/spl_adr_200db", "description": "The United States Food and Drug Administration (FDA) partnered with the National Library\nof Medicine to create a pilot dataset containing standardised information about known\nadverse reactions for 200 FDA-approved drugs. The Structured Product Labels (SPLs),\nthe documents FDA uses to exchange information about drugs and other products, were\nmanually annotated for adverse reactions at the mention level to facilitate development\nand evaluation of text mining tools for extraction of ADRs from all SPLs. The ADRs were\nthen normalised to the Unified Medical Language System (UMLS) and to the Medical\nDictionary for Regulatory Activities (MedDRA).", "evaluation_metadata": {}}, "bigbio/swedish_medical_ner": {"name": "bigbio/swedish_medical_ner", "description": "swedish_medical_ner is Named Entity Recognition dataset on medical text in Swedish. \nIt consists three subsets which are in turn derived from three different sources \nrespectively: the Swedish Wikipedia (a.k.a. wiki), L\u00e4kartidningen (a.k.a. lt), \nand 1177 V\u00e5rdguiden (a.k.a. 1177). While the Swedish Wikipedia and L\u00e4kartidningen \nsubsets in total contains over 790000 sequences with 60 characters each, \nthe 1177 V\u00e5rdguiden subset is manually annotated and contains 927 sentences, \n2740 annotations, out of which 1574 are disorder and findings, 546 are \npharmaceutical drug, and 620 are body structure.\n\nTexts from both Swedish Wikipedia and L\u00e4kartidningen were automatically annotated \nusing a list of medical seed terms. Sentences from 1177 V\u00e5rdguiden were manuually \nannotated.", "evaluation_metadata": {}}, "bigbio/tmvar_v1": {"name": "bigbio/tmvar_v1", "description": "This dataset contains 500 PubMed articles manually annotated with mutation mentions of various kinds. It can be used for NER tasks only.\nThe dataset is split into train(334) and test(166) splits", "evaluation_metadata": {}}, "bigbio/tmvar_v2": {"name": "bigbio/tmvar_v2", "description": "This dataset contains 158 PubMed articles manually annotated with mutation mentions of various kinds and dbsnp normalizations for each of them.\nIt can be used for NER tasks and NED tasks, This dataset has a single split", "evaluation_metadata": {}}, "bigbio/twadrl": {"name": "bigbio/twadrl", "description": "The TwADR-L dataset contains medical concepts written on social media (Twitter) mapped to how they are formally written in medical ontologies (SIDER 4). \\", "evaluation_metadata": {}}, "bigbio/umnsrs": {"name": "bigbio/umnsrs", "description": "UMNSRS, developed by Pakhomov, et al., consists of 725 clinical term pairs whose semantic similarity and relatedness.\nThe similarity and relatedness of each term pair was annotated based on a continuous scale by having the resident touch\na bar on a touch sensitive computer screen to indicate the degree of similarity or relatedness.\nThe following subsets are available:\n- similarity: A set of 566 UMLS concept pairs manually rated for semantic similarity (e.g. whale-dolphin) using a\n continuous response scale.\n- relatedness: A set of 588 UMLS concept pairs manually rated for semantic relatedness (e.g. needle-thread) using a\n continuous response scale.\n- similarity_mod: Modification of the UMNSRS-Similarity dataset to exclude control samples and those pairs that did not\n match text in clinical, biomedical and general English corpora. Exact modifications are detailed in the paper (Corpus\n Domain Effects on Distributional Semantic Modeling of Medical Terms. Serguei V.S. Pakhomov, Greg Finley, Reed McEwan,\n Yan Wang, and Genevieve B. Melton. Bioinformatics. 2016; 32(23):3635-3644). The resulting dataset contains 449 pairs.\n- relatedness_mod: Modification of the UMNSRS-Relatedness dataset to exclude control samples and those pairs that did\n not match text in clinical, biomedical and general English corpora. Exact modifications are detailed in the paper\n (Corpus Domain Effects on Distributional Semantic Modeling of Medical Terms. Serguei V.S. Pakhomov, Greg Finley,\n Reed McEwan, Yan Wang, and Genevieve B. Melton. Bioinformatics. 2016; 32(23):3635-3644).\n The resulting dataset contains 458 pairs.", "evaluation_metadata": {}}, "bigbio/verspoor_2013": {"name": "bigbio/verspoor_2013", "description": "This dataset contains annotations for a small corpus of full text journal publications on the subject of inherited colorectal cancer. It is suitable for Named Entity Recognition and Relation Extraction tasks. It uses the Variome Annotation Schema, a schema that aims to capture the core concepts and relations relevant to cataloguing and interpreting human genetic variation and its relationship to disease, as described in the published literature. The schema was inspired by the needs of the database curators of the International Society for Gastrointestinal Hereditary Tumours (InSiGHT) database, but is intended to have application to genetic variation information in a range of diseases.", "evaluation_metadata": {}}, "Murple/mmcrsc": {"name": "Murple/mmcrsc", "description": "The corpus by Magic Data Technology Co., Ltd. , containing 755 hours of scripted read speech data \nfrom 1080 native speakers of the Mandarin Chinese spoken in mainland China. \nThe sentence transcription accuracy is higher than 98%.", "evaluation_metadata": {}}, "rubentito/OCR-IDL": {"name": "rubentito/OCR-IDL", "description": " The OCR-IDL Dataset contains the OCR annotations of 26M pages of theIndustry Document Library (IDL). It is specially intended to be used for text-layout self-supervised tasks such as Masked Language Modeling or Text De-noising. However, we also include the url to the documents so that can be downloaded for image-text alignment tasks.", "evaluation_metadata": {}}, "cjvt/si_nli": {"name": "cjvt/si_nli", "description": "SI-NLI (Slovene Natural Language Inference Dataset) contains 5,937 human-created Slovene sentence pairs \n(premise and hypothesis) that are manually labeled with the labels \"entailment\", \"contradiction\", and \"neutral\". \nThe dataset was created using sentences that appear in the Slovenian reference corpus ccKres. \nAnnotators were tasked to modify the hypothesis in a candidate pair in a way that reflects one of the labels. \nThe dataset is balanced since the annotators created three modifications (entailment, contradiction, neutral) \nfor each candidate sentence pair.", "evaluation_metadata": {}}, "yuansui/GitTables": {"name": "yuansui/GitTables", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "Fazzie/Teyvat": {"name": "Fazzie/Teyvat", "description": "Teyvat is the first small-scale text-to-image prompt dataset for Genshin impact.", "evaluation_metadata": {}}, "jpwahle/etpc": {"name": "jpwahle/etpc", "description": "The EPT typology addresses several practical limitations of existing paraphrase typologies: it is the first typology that copes with the non-paraphrase pairs in the paraphrase identification corpora and distinguishes between contextual and habitual paraphrase types. ETPC is the largest corpus to date annotated with atomic paraphrase types.", "evaluation_metadata": {}}, "joelito/MultiLegalPile_Chunks_500": {"name": "joelito/MultiLegalPile_Chunks_500", "description": "A chunked version of the MultiLegalPile dataset.", "evaluation_metadata": {}}, "carlosdanielhernandezmena/dummy_corpus_asr_es": {"name": "carlosdanielhernandezmena/dummy_corpus_asr_es", "description": "An extremely small corpus of 40 audio files taken from Common Voice (es) with the objective of testing how to share datasets in Hugging Face.", "evaluation_metadata": {}}, "PlanTL-GOB-ES/sts-es": {"name": "PlanTL-GOB-ES/sts-es", "description": "For Semantic Text Similarity, we collected the Spanish test sets from SemEval-2014 (Agirre et al., 2014) and SemEval-2015 (Agirre et al., 2015). Since no training data was provided for the Spanish subtask, we randomly sampled both datasets into 1,321 sentences for the train set, 78 sentences for the development set, and 156 sentences for the test set. To make the task harder for the models, we purposely made the development set smaller than the test set.", "evaluation_metadata": {}}, "severo/mnist": {"name": "severo/mnist", "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.", "evaluation_metadata": {}}, "carlosdanielhernandezmena/toy_corpus_asr_es": {"name": "carlosdanielhernandezmena/toy_corpus_asr_es", "description": "An extremely small corpus of 40 audio files taken from Common Voice (es) with the objective of testing how to share datasets in Hugging Face.", "evaluation_metadata": {}}, "joelito/MultiLegalPile_Wikipedia_Filtered": {"name": "joelito/MultiLegalPile_Wikipedia_Filtered", "description": "A filtered version of the MultiLegalPile dataset, together with wikipedia articles.", "evaluation_metadata": {}}, "hoskinson-center/proofnet": {"name": "hoskinson-center/proofnet", "description": "A dataset that evaluates formally proving and autoformalizing undergraduate mathematics.", "evaluation_metadata": {}}, "PlanTL-GOB-ES/WikiCAT_esv2": {"name": "PlanTL-GOB-ES/WikiCAT_esv2", "description": "WikiCAT: Text Classification Spanish dataset from the Viquipedia", "evaluation_metadata": {}}, "arbml/arabic_pos_dialect": {"name": "arbml/arabic_pos_dialect", "description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). Each dataset consists of a set of 350 manually segmented and POS tagged tweets.", "evaluation_metadata": {}}, "carlosdanielhernandezmena/ravnursson_asr": {"name": "carlosdanielhernandezmena/ravnursson_asr", "description": "The corpus \\\"RAVNURSSON FAROESE SPEECH AND TRANSCRIPTS\\\" (or RAVNURSSON Corpus for short) is a collection of speech recordings with transcriptions intended for Automatic Speech Recognition (ASR) applications in the language that is spoken at the Faroe Islands (Faroese). It was curated at the Reykjav\u00edk University (RU) in 2022.", "evaluation_metadata": {}}, "Freed-Wu/kodak": {"name": "Freed-Wu/kodak", "description": "The pictures below link to lossless, true color (24 bits per pixel, aka \"full\ncolor\") images. It is my understanding they have been released by the Eastman\nKodak Company for unrestricted usage. Many sites use them as a standard test\nsuite for compression testing, etc. Prior to this site, they were only\navailable in the Sun Raster format via ftp. This meant that the images could\nnot be previewed before downloading. Since their release, however, the lossless\nPNG format has been incorporated into all the major browsers. Since PNG\nsupports 24-bit lossless color (which GIF and JPEG do not), it is now possible\nto offer this browser-friendly access to the images.", "evaluation_metadata": {}}, "joelito/EU_Wikipedias": {"name": "joelito/EU_Wikipedias", "description": "Wikipedia dataset containing cleaned articles of all languages.\nThe datasets are built from the Wikipedia dump\n(https://dumps.wikimedia.org/) with one split per language. Each example\ncontains the content of one full Wikipedia article with cleaning to strip\nmarkdown and unwanted sections (references, etc.).", "evaluation_metadata": {}}, "research-backup/semeval2012_relational_similarity_v6": {"name": "research-backup/semeval2012_relational_similarity_v6", "description": "[SemEVAL 2012 task 2: Relational Similarity](https://aclanthology.org/S12-1047/)", "evaluation_metadata": {}}, "research-backup/semeval2012_relational_similarity_v7": {"name": "research-backup/semeval2012_relational_similarity_v7", "description": "[SemEVAL 2012 task 2: Relational Similarity](https://aclanthology.org/S12-1047/)", "evaluation_metadata": {}}, "jeanlee/kmhas_korean_hate_speech": {"name": "jeanlee/kmhas_korean_hate_speech", "description": "The K-MHaS (Korean Multi-label Hate Speech) dataset contains 109k utterances from Korean online news comments labeled with 8 fine-grained hate speech classes or Not Hate Speech class.\nThe fine-grained hate speech classes are politics, origin, physical, age, gender, religion, race, and profanity and these categories are selected in order to reflect the social and historical context.", "evaluation_metadata": {}}, "gsarti/mt_geneval": {"name": "gsarti/mt_geneval", "description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.", "evaluation_metadata": {}}, "sagnikrayc/snli-cf-kaushik": {"name": "sagnikrayc/snli-cf-kaushik", "description": "The SNLI corpus (version 1.0) is a collection of 570k human-written English sentence pairs manually labeled for balanced classification with the labels entailment, contradiction, and neutral, supporting the task of natural language inference (NLI), also known as recognizing textual entailment (RTE). In the ICLR 2020 paper [Learning the Difference that Makes a Difference with Counterfactually-Augmented Data](https://openreview.net/forum?id=Sklgs0NFvr), Kaushik et. al. provided a dataset with counterfactual perturbations on the SNLI and IMDB data. This repository contains the original and counterfactual perturbations for the SNLI data, which was generated after processing the original data from [here](https://github.com/acmi-lab/counterfactually-augmented-data).", "evaluation_metadata": {}}, "ciempiess/ciempiess_test": {"name": "ciempiess/ciempiess_test", "description": "The CIEMPIESS TEST Corpus is a gender balanced corpus destined to test acoustic models for the speech recognition task. The corpus was manually transcribed and it contains audio recordings from 10 male and 10 female speakers. The CIEMPIESS TEST is one of the three corpora included at the LDC's \\\"CIEMPIESS Experimentation\\\" (LDC2019S07).", "evaluation_metadata": {}}, "sayakpaul/nyu_depth_v2": {"name": "sayakpaul/nyu_depth_v2", "description": "The NYU-Depth V2 data set is comprised of video sequences from a variety of indoor scenes as recorded by both the RGB and Depth cameras from the Microsoft Kinect.", "evaluation_metadata": {}}, "Sociovestix/lenu": {"name": "Sociovestix/lenu", "description": " This dataset contains legal entity names from the Global LEI System in\n which each entity is assigned with a unique Legal Entity Identifier (LEI)\n code (ISO Standard 17441) along with their corresponding Entity Legal\n Form (ELF) Codes (ISO Standard 20275) which specifies the legal form of\n each entity.", "evaluation_metadata": {}}, "ML-Projects-Kiel/tweetyface": {"name": "ML-Projects-Kiel/tweetyface", "description": "Dataset containing Tweets from prominent Twitter Users in various languages. The dataset has been created utilizing a crawler for the Twitter API.\\n \\", "evaluation_metadata": {}}, "shayand/coqa_squad": {"name": "shayand/coqa_squad", "description": "CoQA SQuAD: A Conversational Question Answering Challenge Modified for SQuAD format", "evaluation_metadata": {}}, "nlphuji/vasr": {"name": "nlphuji/vasr", "description": "VASR is a challenging dataset for evaluating computer vision commonsense reasoning abilities. Given a triplet of images, the task is to select an image candidate B' that completes the analogy (A to A' is like B to what?). Unlike previous work on visual analogy that focused on simple image transformations, we tackle complex analogies requiring understanding of scenes. Our experiments demonstrate that state-of-the-art models struggle with carefully chosen distractors (\u00b153%, compared to 90% human accuracy).", "evaluation_metadata": {}}, "dattatreya303/covid-qa-tts": {"name": "dattatreya303/covid-qa-tts", "description": "This dataset is adapted from the covid_qa_deepset model card, by adding train/validation splits in the split generator function.", "evaluation_metadata": {}}, "MLRS/masri_test": {"name": "MLRS/masri_test", "description": "The MASRI-TEST CORPUS was created out of YouTube videos belonging to the channel of the University of Malta. It has a length of 1 hour and it is gender balanced, as it has the same number of male and female speakers.", "evaluation_metadata": {}}, "MLRS/masri_dev": {"name": "MLRS/masri_dev", "description": "The MASRI-DEV CORPUS was created out of YouTube videos belonging to the channel of the University of Malta. It has a length of 1 hour and it is gender balanced, as it has the same number of male and female speakers.", "evaluation_metadata": {}}, "TomTBT/pmc_open_access_section": {"name": "TomTBT/pmc_open_access_section", "description": "The PMC Open Access Subset includes more than 3.4 million journal articles and preprints that are made available under\nlicense terms that allow reuse. \nNot all articles in PMC are available for text mining and other reuse, many have copyright protection, however articles\nin the PMC Open Access Subset are made available under Creative Commons or similar licenses that generally allow more\nliberal redistribution and reuse than a traditional copyrighted work.\nThe PMC Open Access Subset is one part of the PMC Article Datasets\n\nThis version takes XML version as source, benefiting from the structured text\nto split the articles in sections, naming the introduction, methods, results,\ndiscussion and conclusion, front, body and back. XML is then removed and format\nit to plain text.", "evaluation_metadata": {}}, "language-and-voice-lab/samromur_children": {"name": "language-and-voice-lab/samromur_children", "description": "The Samr\u00f3mur Children corpus contains more than 137000 validated speech-recordings uttered by Icelandic children.", "evaluation_metadata": {}}, "lmqg/qa_squad": {"name": "lmqg/qa_squad", "description": "SQuAD with the train/validation/test split used in SQuAD QG", "evaluation_metadata": {}}, "language-and-voice-lab/raddromur_asr": {"name": "language-and-voice-lab/raddromur_asr", "description": "The Raddr\u00f3mur Corpus is intended for the speech recognition field and it is made out of radio podcasts mostly taken from R\u00daV (ruv.is). Such podcasts were selected because they contained a text script that matches with certain fidelity what is said during the show. After automatic segmentation of the episodes, the transcriptions were inferred using the scripts along with a forced alignment technique.", "evaluation_metadata": {}}, "AlienKevin/klee": {"name": "AlienKevin/klee", "description": "128x128 PNG images of kanjis from the Klee font", "evaluation_metadata": {}}, "language-and-voice-lab/malromur_asr": {"name": "language-and-voice-lab/malromur_asr", "description": "The M\u00e1lr\u00f3mur corpus is an open source corpus of Icelandic voice samples.", "evaluation_metadata": {}}, "AlienKevin/serif_klee": {"name": "AlienKevin/serif_klee", "description": "Horizontally concatenated 256x128 PNG images of kanjis in two fonts.\nThe left kanji is in the Source Han SC Serif font designed for Simplified Chinese\nThe right kanji is in the Klee font designed for Japanese", "evaluation_metadata": {}}, "cjvt/cc_gigafida": {"name": "cjvt/cc_gigafida", "description": "The ccGigafida corpus contains a subsample of the Gigafida corpus. The Gigafida corpus is an extensive collection of \nSlovene text of various genres, from daily newspapers, magazines, all kinds of books (fiction, non-fiction, textbooks), \nweb pages, transcriptions of parliamentary debates and similar.", "evaluation_metadata": {}}, "vesteinn/swe-nerc": {"name": "vesteinn/swe-nerc", "description": "The corpus consists of ca. 150.000 words of text.", "evaluation_metadata": {}}, "shunk031/jsnli": {"name": "shunk031/jsnli", "description": "== \u65e5\u672c\u8a9eSNLI(JSNLI)\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 ==\n\nSNLI \u30b3\u30fc\u30d1\u30b9\u3092\u65e5\u672c\u8a9e\u306b\u7ffb\u8a33\u3057\u305f\u81ea\u7136\u8a00\u8a9e\u63a8\u8ad6\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\n\u5b66\u7fd2\u30c7\u30fc\u30bf\u306f\u5143\u30c7\u30fc\u30bf\u3092\u7ffb\u8a33\u3057\u3001\u8a08\u7b97\u6a5f\u306b\u3088\u308b\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u306b\u3088\u3063\u3066\u4f5c\u6210\n\u8a55\u4fa1\u30c7\u30fc\u30bf\u306f\u65e5\u672c\u8a9e\u3068\u3057\u3066\u610f\u5473\u304c\u901a\u308b\u304b\u3001\u7ffb\u8a33\u5f8c\u306e\u30e9\u30d9\u30eb\u304c\u5143\u306e\u30e9\u30d9\u30eb\u3068\u4e00\u81f4\u3057\u3066\u3044\u308b\u304b\u3069\u3046\u304b\u306e2\u6bb5\u968e\u306e\u30af\u30e9\u30a6\u30c9\u30bd\u30fc\u30b7\u30f3\u30b0\u306b\u3088\u308a\u30c7\u30fc\u30bf\u3092\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0", "evaluation_metadata": {}}, "dattatreya303/covid-qa-synthetic": {"name": "dattatreya303/covid-qa-synthetic", "description": "This dataset is adapted from the contexts of covid_qa_deepset dataset card. All QA pairs here have been synthetically generated.", "evaluation_metadata": {}}, "lmqg/qg_tweetqa": {"name": "lmqg/qg_tweetqa", "description": "Question generation dataset based on [TweetQA](https://huggingface.co/datasets/tweet_qa).", "evaluation_metadata": {}}, "AlienKevin/kanjivg_klee": {"name": "AlienKevin/kanjivg_klee", "description": "Horizontally concatenated 256x128 PNG images of kanjis in two fonts.\nThe left kanji is in the KanjiVG font with stroke order indicated by colors.\nSee https://github.com/KanjiVG/kanjivg for more info.\nThe right kanji is in the Klee font designed for Japanese", "evaluation_metadata": {}}, "egm517/hupd_augmented": {"name": "egm517/hupd_augmented", "description": "The Harvard USPTO Patent Dataset (HUPD) is a large-scale, well-structured, and multi-purpose corpus \nof English-language patent applications filed to the United States Patent and Trademark Office (USPTO) \nbetween 2004 and 2018. With more than 4.5 million patent documents, HUPD is two to three times larger \nthan comparable corpora. Unlike other NLP patent datasets, HUPD contains the inventor-submitted versions \nof patent applications, not the final versions of granted patents, allowing us to study patentability at \nthe time of filing using NLP methods for the first time.", "evaluation_metadata": {}}, "its5Q/yandex-q": {"name": "its5Q/yandex-q", "description": "This is a dataset of questions and answers scraped from Yandex.Q.", "evaluation_metadata": {}}, "RobotsMaliAI/bayelemabaga": {"name": "RobotsMaliAI/bayelemabaga", "description": "The Bayelemabaga dataset is a collection of 44160 aligned machine translation ready Bambara-French lines, \noriginating from Corpus Bambara de Reference. The dataset is constitued of text extracted from 231 source files, \nvaring from periodicals, books, short stories, blog posts, part of the Bible and the Quran.", "evaluation_metadata": {}}, "parambharat/ucla_dataset": {"name": "parambharat/ucla_dataset", "description": "THE UCLA Tamil Labelled Total Duration contains 1160.24 hours of labelled ASR audio collected from various sources", "evaluation_metadata": {}}, "parambharat/mile_dataset": {"name": "parambharat/mile_dataset", "description": "IISc-MILE Tamil ASR Corpus contains transcribed speech corpus for training ASR systems for Tamil language. It contains ~150 hours of read speech data collected from 531 speakers in a noise-free recording environment with high quality USB microphones.", "evaluation_metadata": {}}, "phucdev/noisyner": {"name": "phucdev/noisyner", "description": "NoisyNER is a dataset for the evaluation of methods to handle noisy labels when training machine learning models. \nIt is from the NLP/Information Extraction domain and was created through a realistic distant supervision technique. \nSome highlights and interesting aspects of the data are:\n- Seven sets of labels with differing noise patterns to evaluate different noise levels on the same instances\n- Full parallel clean labels available to compute upper performance bounds or study scenarios where a small amount of \ngold-standard data can be leveraged\n- Skewed label distribution (typical for Named Entity Recognition tasks)\n- For some label sets: noise level higher than the true label probability\n- Sequential dependencies between the labels\n\nFor more details on the dataset and its creation process, please refer to our publication \nhttps://ojs.aaai.org/index.php/AAAI/article/view/16938 (published at AAAI'21).", "evaluation_metadata": {}}, "HuggingFaceM4/TextCaps": {"name": "HuggingFaceM4/TextCaps", "description": "extCaps requires models to read and reason about text in images to generate captions about them. Specifically, models need to incorporate a new modality of text present in the images and reason over it and visual content in the image to generate image descriptions.\nCurrent state-of-the-art models fail to generate captions for images in TextCaps because they do not have text reading and reasoning capabilities. See the examples in the image to compare ground truth answers and corresponding predictions by a state-of-the-art model.", "evaluation_metadata": {}}, "parambharat/tamil_asr_corpus": {"name": "parambharat/tamil_asr_corpus", "description": "The corpus contains roughly 1000 hours of audio and trasncripts in Tamil language. The transcripts have beedn de-duplicated using exact match deduplication.", "evaluation_metadata": {}}, "ipipan/nkjp1m": {"name": "ipipan/nkjp1m", "description": "This is the official dataset for NKJP1M \u2013 the 1-million token subcorpus of the\nNational Corpus of Polish (Narodowy Korpus J\u0119zyka Polskiego)\n\nBesides the text (divided into paragraphs/samples and sentences) the\nset contains lemmas and morpho-syntactic tags for all tokens in the corpus.\n\nThis release corresponds to the version 1.2 of the corpus with\nfollowing corrections and improvements. In particular the\nmorpho-syntactic annotation has been aligned with the present version\nof Morfeusz2 morphological analyser.", "evaluation_metadata": {}}, "albertvillanova/TextCaps": {"name": "albertvillanova/TextCaps", "description": "extCaps requires models to read and reason about text in images to generate captions about them. Specifically, models need to incorporate a new modality of text present in the images and reason over it and visual content in the image to generate image descriptions.\nCurrent state-of-the-art models fail to generate captions for images in TextCaps because they do not have text reading and reasoning capabilities. See the examples in the image to compare ground truth answers and corresponding predictions by a state-of-the-art model.", "evaluation_metadata": {}}, "HuggingFaceM4/NoCaps": {"name": "HuggingFaceM4/NoCaps", "description": "Dubbed NoCaps, for novel object captioning at scale, NoCaps consists of 166,100 human-generated captions describing 15,100 images from the Open Images validation and test sets.\nThe associated training data consists of COCO image-caption pairs, plus Open Images image-level labels and object bounding boxes.\nSince Open Images contains many more classes than COCO, nearly 400 object classes seen in test images have no or very few associated training captions (hence, nocaps).", "evaluation_metadata": {}}, "giulio98/xlcost_steps": {"name": "giulio98/xlcost_steps", "description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "evaluation_metadata": {}}, "language-and-voice-lab/althingi_asr": {"name": "language-and-voice-lab/althingi_asr", "description": "Althingi Parliamentary Speech consists of approximately 542 hours of recorded speech from Althingi, the Icelandic Parliament. Speeches date from 2005-2016.", "evaluation_metadata": {}}, "zmao/food_img_caption_small": {"name": "zmao/food_img_caption_small", "description": "For finetunning stable diffuser with food images", "evaluation_metadata": {}}, "parambharat/malayalam_asr_corpus": {"name": "parambharat/malayalam_asr_corpus", "description": "The corpus contains roughly 10 hours of audio and trasncripts in Malayalam language. The transcripts have beedn de-duplicated using exact match deduplication.", "evaluation_metadata": {}}, "AlienKevin/source_han_sans_ja_extra_light_left_right": {"name": "AlienKevin/source_han_sans_ja_extra_light_left_right", "description": "128x128 PNG images of Left-Right Chinese Characters in Big 5 and Adobe-Japan1-7.\nFont used is Source Han Sans J designed for Japanese.", "evaluation_metadata": {}}, "AlienKevin/source_han_sans_ja_regular_left_right": {"name": "AlienKevin/source_han_sans_ja_regular_left_right", "description": "128x128 PNG images of Left-Right Chinese Characters in Big 5 and Adobe-Japan1-7 (partial).\nFont used is Source Han Sans J Regular designed for Japanese.", "evaluation_metadata": {}}, "zmao/chinese_food_caption": {"name": "zmao/chinese_food_caption", "description": "For finetunning stable diffuser with chinese food images", "evaluation_metadata": {}}, "parambharat/kannada_asr_corpus": {"name": "parambharat/kannada_asr_corpus", "description": "The corpus contains roughly 360 hours of audio and transcripts in Kannada language. The transcripts have beed de-duplicated using exact match deduplication.", "evaluation_metadata": {}}, "parambharat/telugu_asr_corpus": {"name": "parambharat/telugu_asr_corpus", "description": "The corpus contains roughly 360 hours of audio and transcripts in Telugu language. The transcripts have beed de-duplicated using exact match deduplication.", "evaluation_metadata": {}}, "albertvillanova/universal_dependencies": {"name": "albertvillanova/universal_dependencies", "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).", "evaluation_metadata": {}}, "HuggingFaceM4/LocalizedNarratives": {"name": "HuggingFaceM4/LocalizedNarratives", "description": "Localized Narratives, a new form of multimodal image annotations connecting vision and language.\nWe ask annotators to describe an image with their voice while simultaneously hovering their mouse over the region they are describing.\nSince the voice and the mouse pointer are synchronized, we can localize every single word in the description.\nThis dense visual grounding takes the form of a mouse trace segment per word and is unique to our data.\nWe annotated 849k images with Localized Narratives: the whole COCO, Flickr30k, and ADE20K datasets, and 671k images of Open Images, all of which we make publicly available.", "evaluation_metadata": {}}, "masakhane/masakhaner2": {"name": "masakhane/masakhaner2", "description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811", "evaluation_metadata": {}}, "earlybyrd/plp_q_n_a": {"name": "earlybyrd/plp_q_n_a", "description": "This is just a test.", "evaluation_metadata": {}}, "EleutherAI/lambada_openai": {"name": "EleutherAI/lambada_openai", "description": "The LAMBADA dataset as processed by OpenAI. It is used to evaluate the capabilities\nof computational models for text understanding by means of a word prediction task.\nLAMBADA is a collection of narrative texts sharing the characteristic that human subjects\nare able to guess their last word if they are exposed to the whole text, but not\nif they only see the last sentence preceding the target word. To succeed on LAMBADA,\ncomputational models cannot simply rely on local context, but must be able to keep track\nof information in the broader discourse.\n\nReference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199", "evaluation_metadata": {}}, "JLD/unsplash25k-image-embeddings": {"name": "JLD/unsplash25k-image-embeddings", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "lmqg/qag_dequad": {"name": "lmqg/qag_dequad", "description": "Question & answer generation dataset based on SQuAD.", "evaluation_metadata": {}}, "lmqg/qag_koquad": {"name": "lmqg/qag_koquad", "description": "Question & answer generation dataset based on SQuAD.", "evaluation_metadata": {}}, "lmqg/qag_jaquad": {"name": "lmqg/qag_jaquad", "description": "Question & answer generation dataset based on SQuAD.", "evaluation_metadata": {}}, "lmqg/qag_ruquad": {"name": "lmqg/qag_ruquad", "description": "Question & answer generation dataset based on SQuAD.", "evaluation_metadata": {}}, "lmqg/qag_esquad": {"name": "lmqg/qag_esquad", "description": "Question & answer generation dataset based on SQuAD.", "evaluation_metadata": {}}, "lmqg/qag_itquad": {"name": "lmqg/qag_itquad", "description": "Question & answer generation dataset based on SQuAD.", "evaluation_metadata": {}}, "openai/webgpt_comparisons": {"name": "openai/webgpt_comparisons", "description": "WebGPT Comparisons contains all of the comparisons marked as suitable for reward modelling from the WebGPT paper.", "evaluation_metadata": {}}, "HIT-TMG/Hansel": {"name": "HIT-TMG/Hansel", "description": "Hansel is a high-quality human-annotated Chinese entity linking (EL) dataset, used for testing Chinese EL systems' generalization ability to tail entities and emerging entities.\nThe test set contains Few-shot (FS) and zero-shot (ZS) slices, has 10K examples and uses Wikidata as the corresponding knowledge base.\nThe training and validation sets are from Wikipedia hyperlinks, useful for large-scale pretraining of Chinese EL systems.", "evaluation_metadata": {}}, "orai-nlp/basqueGLUE": {"name": "orai-nlp/basqueGLUE", "description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.", "evaluation_metadata": {}}, "cjvt/sloleks": {"name": "cjvt/sloleks", "description": "Sloleks is a reference morphological lexicon of Slovene that was developed to be used in various NLP applications and language manuals. \\ \nIt contains Slovene lemmas, their inflected or derivative word forms and the corresponding grammatical description. In addition to the approx. 100,000 entries already available in Sloleks 2.0, Sloleks 3.0 contains an additional cca. 265,000 newly generated entries from the most frequent lemmas in Gigafida 2.0 not yet included in previous versions of Sloleks. For verbs, adjectives, adverbs, and common nouns, the lemmas were checked manually by three annotators and \\ \nincluded in Sloleks only if confirmed as legitimate by at least one annotator. No manual checking was performed on proper nouns.", "evaluation_metadata": {}}, "mrm8488/unnatural-instructions": {"name": "mrm8488/unnatural-instructions", "description": "Unnatural Instructions is a dataset of instructions automatically generated by a Large Language model. See full details in the paper: \"Unnatural Instructions: Tuning Language Models with (Almost) No Human Labor\" (https://arxiv.org/abs/2212.09689)", "evaluation_metadata": {}}, "neulab/docprompting-conala": {"name": "neulab/docprompting-conala", "description": "This is the re-split of CoNaLa dataset. For each code snippet in the dev and test set, at least one function is held out from the training set. This split aims at testing a code generation model's capacity in generating unseen functions.\nWe further make sure that examples from the same StackOverflow post (same question_id before -) are in the same split.", "evaluation_metadata": {}}, "joost6196/go_emotions_dutch": {"name": "joost6196/go_emotions_dutch", "description": "The GoEmotions dataset contains 58k carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\nThe emotion categories are admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire,\ndisappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness,\noptimism, pride, realization, relief, remorse, sadness, surprise.", "evaluation_metadata": {}}, "neulab/tldr": {"name": "neulab/tldr", "description": "This is the re-split of CoNaLa dataset. For each code snippet in the dev and test set, at least one function is held out from the training set. This split aims at testing a code generation model's capacity in generating unseen functions.\nWe further make sure that examples from the same StackOverflow post (same question_id before -) are in the same split.", "evaluation_metadata": {}}, "augsaksham/devrev": {"name": "augsaksham/devrev", "description": "combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers\n to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but\n also determine when no answer is supported by the paragraph and abstain from answering.", "evaluation_metadata": {}}, "openai/summarize_from_feedback": {"name": "openai/summarize_from_feedback", "description": "Summarize from Feedback contains the human feedback data released by the \"Learning to summarize from human feedback\" paper.", "evaluation_metadata": {}}, "aashsach/multiconer2": {"name": "aashsach/multiconer2", "description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition", "evaluation_metadata": {}}, "augsaksham/negdata": {"name": "augsaksham/negdata", "description": "combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers\n to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but\n also determine when no answer is supported by the paragraph and abstain from answering.", "evaluation_metadata": {}}, "tushar117/xalign": {"name": "tushar117/xalign", "description": "It consists of an extensive collection of a high quality cross-lingual fact-to-text dataset where facts are in English \nand corresponding sentences are in native language for person biographies. The Train & validation splits are created \nusing distant supervision methods and Test data is generated through human annotations.", "evaluation_metadata": {}}, "eloukas/edgar-corpus": {"name": "eloukas/edgar-corpus", "description": "The dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).", "evaluation_metadata": {}}, "tasksource/babi_nli": {"name": "tasksource/babi_nli", "description": "bAbi tasks recasted as natural language inference.", "evaluation_metadata": {}}, "JanosAudran/financial-reports-sec": {"name": "JanosAudran/financial-reports-sec", "description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.", "evaluation_metadata": {}}, "rohitp1/librispeech_asr_clean": {"name": "rohitp1/librispeech_asr_clean", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "jhu-clsp/bernice-pretrain-data": {"name": "jhu-clsp/bernice-pretrain-data", "description": "Tweet IDs for the 2.5 billion multilingual tweets used to train Bernice, a Twitter encoder.\nThe tweets are from the public 1% Twitter API stream from January 2016 to December 2021. \nTwitter-provided language metadata is provided with the tweet ID. The data contains 66 unique languages, \nas identified by ISO 639 language codes, including `und` for undefined languages.\nTweets need to be re-gathered via the Twitter API.", "evaluation_metadata": {}}, "mqddb/test-dataset": {"name": "mqddb/test-dataset", "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.", "evaluation_metadata": {}}, "HugoLaurencon/IIIT-5K": {"name": "HugoLaurencon/IIIT-5K", "description": "The IIIT 5K-Word dataset is harvested from Google image search. \nQuery words like billboards, signboard, house numbers, house name plates, movie posters were used to collect images. \nThe dataset contains 5000 cropped word images from Scene Texts and born-digital images. \nThe dataset is divided into train and test parts. \nThis dataset can be used for large lexicon cropped word recognition. \nWe also provide a lexicon of more than 0.5 million dictionary words with this dataset.", "evaluation_metadata": {}}, "zpn/zinc20": {"name": "zpn/zinc20", "description": "This dataset contains ~1B molecules from ZINC20, with their SMILES and SELFIES representations.", "evaluation_metadata": {}}, "lintang/numerical_reasoning_arithmetic": {"name": "lintang/numerical_reasoning_arithmetic", "description": " Generated dataset for testing numerical reasoning", "evaluation_metadata": {}}, "metaeval/imppres": {"name": "metaeval/imppres", "description": "Over >25k semiautomatically generated sentence pairs illustrating well-studied pragmatic inference types. IMPPRES is an NLI dataset following the format of SNLI (Bowman et al., 2015), MultiNLI (Williams et al., 2018) and XNLI (Conneau et al., 2018), which was created to evaluate how well trained NLI models recognize several classes of presuppositions and scalar implicatures.", "evaluation_metadata": {}}, "bigbio/drugprot": {"name": "bigbio/drugprot", "description": "The DrugProt corpus consists of a) expert-labelled chemical and gene mentions, and (b) all binary relationships between them corresponding to a specific set of biologically relevant relation types.", "evaluation_metadata": {}}, "bigbio/cpi": {"name": "bigbio/cpi", "description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts containing annotations of proteins, small molecules, and their relationships", "evaluation_metadata": {}}, "DFKI-SLT/gids": {"name": "DFKI-SLT/gids", "description": "Google-IISc Distant Supervision (GIDS) is a new dataset for distantly-supervised relation extraction.\nGIDS is seeded from the human-judged Google relation extraction corpus.", "evaluation_metadata": {}}, "DFKI-SLT/kbp37": {"name": "DFKI-SLT/kbp37", "description": "KBP37 is a revision of MIML-RE annotation dataset, provided by Gabor Angeli et al. (2014). They use both the 2010 and \n2013 KBP official document collections, as well as a July 2013 dump of Wikipedia as the text corpus for annotation. \nThere are 33811 sentences been annotated. Zhang and Wang made several refinements:\n1. They add direction to the relation names, e.g. '`per:employee_of`' is split into '`per:employee of(e1,e2)`'\nand '`per:employee of(e2,e1)`'. They also replace '`org:parents`' with '`org:subsidiaries`' and replace\n'`org:member of\u2019 with '`org:member`' (by their reverse directions).\n2. They discard low frequency relations such that both directions of each relation occur more than 100 times in the \ndataset.\n\nKBP37 contains 18 directional relations and an additional '`no_relation`' relation, resulting in 37 relation classes.", "evaluation_metadata": {}}, "metaeval/utilitarianism": {"name": "metaeval/utilitarianism", "description": "\"\"\"\n_HOMEPAGE = \"\"\n_LICENSE = \"Creative Commons Attribution-NonCommercial 4.0 International Public License\"\n\n# The HuggingFace dataset library don't host the datasets but only point to the original files\n# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)\n_URLs = {\"default\": \"https://www.dropbox.com/s/041prrjylv0tf0h/ethics.zip?dl=1\"}\n\n\nclass Imppres(datasets.GeneratorBasedBuilder):\n\n VERSION = datasets.Version(\"1.1.0\")\n\n def _info(self):\n features = datasets.Features(\n {\n \"better_choice\": datasets.Value(\"string\"),\n \"worst_choice\": datasets.Value(\"string\"),\n \"comparison\": datasets.Value(\"string\"),\n \"label\": datasets.Value(\"int32\"),\n })\n return datasets.DatasetInfo(\n # This is the description that will appear on the datasets page.\n description=_DESCRIPTION,\n # This defines the different columns of the dataset and their types\n features=features, # Here we define them above because they are different between the two configurations\n # If there's a common (input, target) tuple from the features,\n # specify them here. They'll be used if as_supervised=True in\n # builder.as_dataset.\n supervised_keys=None,\n # Homepage of the dataset for documentation\n homepage=_HOMEPAGE,\n # License for the dataset if available\n license=_LICENSE,\n # Citation for the dataset\n citation=_CITATION,\n )\n\n def _split_generators(self, dl_manager):", "evaluation_metadata": {}}, "Geawher/Entityrecongnitionjobs": {"name": "Geawher/Entityrecongnitionjobs", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "neulab/odex": {"name": "neulab/odex", "description": "ODEX is an Open-Domain EXecution-based NL-to-Code generation data benchmark. \nIt contains 945 samples with a total of 1,707 human-written test cases, \ncovering intents in four different natural languages -- 439 in English, 90 in Spanish, 164 in Japanese, and 252 in Russian.", "evaluation_metadata": {}}, "FatmaZahraZ/JobDecriptionsEntityRecognition": {"name": "FatmaZahraZ/JobDecriptionsEntityRecognition", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "bileldh/conll2003": {"name": "bileldh/conll2003", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "firas-meddeb98/dataset_nlp": {"name": "firas-meddeb98/dataset_nlp", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "achrefla/Ds_achref": {"name": "achrefla/Ds_achref", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "AmenAllah/DataSet": {"name": "AmenAllah/DataSet", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "mariem1994/nlp_project": {"name": "mariem1994/nlp_project", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "Lemswasabi/luxembourgish-asr-rtl-lu": {"name": "Lemswasabi/luxembourgish-asr-rtl-lu", "description": "luxembourgish-asr-rtl-lu dataset is a speech corpus for the under-resourced Luxembourgish language.", "evaluation_metadata": {}}, "khalidalt/Joud": {"name": "khalidalt/Joud", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "Ineract/policies": {"name": "Ineract/policies", "description": "Manually generated dataset for policies qa", "evaluation_metadata": {}}, "kuroneko5943/jd21": {"name": "kuroneko5943/jd21", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": {}}, "kuroneko5943/amz20": {"name": "kuroneko5943/amz20", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": {}}, "kuroneko5943/snap21": {"name": "kuroneko5943/snap21", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": {}}, "kuroneko5943/stock11": {"name": "kuroneko5943/stock11", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": {}}, "kuroneko5943/weibo16": {"name": "kuroneko5943/weibo16", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": {}}, "sustcsenlp/bn_emotion_speech_corpus": {"name": "sustcsenlp/bn_emotion_speech_corpus", "description": "SUST Bangla Emotional Speech Coropus Dataset", "evaluation_metadata": {}}, "Ineract/policies-named-insured": {"name": "Ineract/policies-named-insured", "description": "Manually generated dataset for policies qa", "evaluation_metadata": {}}, "shunk031/wrime": {"name": "shunk031/wrime", "description": "WRIME dataset is a new dataset for emotional intensity estimation with subjective and objective annotations.", "evaluation_metadata": {}}, "lucasmccabe/logiqa": {"name": "lucasmccabe/logiqa", "description": "LogiQA is constructed from the logical comprehension problems from publically available questions of the National Civil Servants Examination of China, which are designed to test the civil servant candidates\u2019 critical thinking and problem solving. This dataset includes the English versions only; the Chinese versions are available via the homepage/original source.", "evaluation_metadata": {}}, "nlp-thedeep/humset": {"name": "nlp-thedeep/humset", "description": "HumSet is a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. HumSet is curated by humanitarian analysts and covers various disasters around the globe that occurred from 2018 to 2021 in 46 humanitarian response projects. The dataset consists of approximately 17K annotated documents in three languages of English, French, and Spanish, originally taken from publicly-available resources. For each document, analysts have identified informative snippets (entries) in respect to common humanitarian frameworks, and assigned one or many classes to each entry. See the our paper for details.", "evaluation_metadata": {}}, "Achitha/simple_tamil": {"name": "Achitha/simple_tamil", "description": "The data contains roughly one and half hours of audio and transcripts in Tamil language.", "evaluation_metadata": {}}, "KETI-AIR/kowow": {"name": "KETI-AIR/kowow", "description": "# KoWoW: Korean Wizard of Wikipedia\n- WoW(Wizard of Wikipedia) \ub370\uc774\ud130\uc14b\uc744 \ud55c\uad6d\uc5b4\ub85c \ubc88\uc5ed\ud55c \ub370\uc774\ud130\uc14b \n\n## Data\n\n- en: KoWoW En-En, Knowledge-English, Utterance-English\n- ko: KoWoW Ko-Ko, Knowledge-Korean, Utterance-Korean\n- ek: KoWoW En-Ko, Knowledge-English, Utterance-Korean\n- ke: KoWoW Ko-En, Knowledge-Korean, Utterance-English\n\n## Usage\n```python\nimport datasets\n\nraw_datsets = datasets.load_dataset(\n \"kowow.py\",\n \"kowow.ko.random.v1.0\",\n cache_dir=\"huggingface_datasets\", \n data_dir=\"data/ko\", # choose en, ko, ek, or ke\n)\n\n```", "evaluation_metadata": {}}, "DFKI-SLT/fabner": {"name": "DFKI-SLT/fabner", "description": "FabNER is a manufacturing text corpus of 350,000+ words for Named Entity Recognition.\nIt is a collection of abstracts obtained from Web of Science through known journals available in manufacturing process \nscience research.\nFor every word, there were categories/entity labels defined namely Material (MATE), Manufacturing Process (MANP), \nMachine/Equipment (MACEQ), Application (APPL), Features (FEAT), Mechanical Properties (PRO), Characterization (CHAR), \nParameters (PARA), Enabling Technology (ENAT), Concept/Principles (CONPRI), Manufacturing Standards (MANS) and \nBioMedical (BIOP). Annotation was performed in all categories along with the output tag in 'BIOES' format: \nB=Beginning, I-Intermediate, O=Outside, E=End, S=Single.", "evaluation_metadata": {}}, "Yaxin/ZhangABSADataset": {"name": "Yaxin/ZhangABSADataset", "description": "Named entities are phrases that contain the names of persons, organizations, locations, times and quantities.\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nThe shared task of CoNLL-2002 concerns language-independent named entity recognition.\nWe will concentrate on four types of named entities: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups.\nThe participants of the shared task will be offered training and test data for at least two languages.\nThey will use the data for developing a named-entity recognition system that includes a machine learning component.\nInformation sources other than the training data may be used in this shared task.\nWe are especially interested in methods that can use additional unannotated data for improving their performance (for example co-training).\nThe train/validation/test sets are available in Spanish and Dutch.\nFor more details see https://www.clips.uantwerpen.be/semeval2016/ner/ and https://www.aclweb.org/anthology/W02-2024/", "evaluation_metadata": {}}, "Yaxin/SemEval2020Task9CodeSwitch": {"name": "Yaxin/SemEval2020Task9CodeSwitch", "description": "Named entities are phrases that contain the names of persons, organizations, locations, times and quantities.\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nThe shared task of CoNLL-2002 concerns language-independent named entity recognition.\nWe will concentrate on four types of named entities: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups.\nThe participants of the shared task will be offered training and test data for at least two languages.\nThey will use the data for developing a named-entity recognition system that includes a machine learning component.\nInformation sources other than the training data may be used in this shared task.\nWe are especially interested in methods that can use additional unannotated data for improving their performance (for example co-training).\nThe train/validation/test sets are available in Spanish and Dutch.\nFor more details see https://www.clips.uantwerpen.be/semeval2016/ner/ and https://www.aclweb.org/anthology/W02-2024/", "evaluation_metadata": {}}, "ruanchaves/hatebr": {"name": "ruanchaves/hatebr", "description": "HateBR is the first large-scale expert annotated corpus of Brazilian Instagram comments for hate speech and offensive language detection on the web and social media. The HateBR corpus was collected from Brazilian Instagram comments of politicians and manually annotated by specialists. It is composed of 7,000 documents annotated according to three different layers: a binary classification (offensive versus non-offensive comments), offensiveness-level (highly, moderately, and slightly offensive messages), and nine hate speech groups (xenophobia, racism, homophobia, sexism, religious intolerance, partyism, apology for the dictatorship, antisemitism, and fatphobia). Each comment was annotated by three different annotators and achieved high inter-annotator agreement. Furthermore, baseline experiments were implemented reaching 85% of F1-score outperforming the current literature models for the Portuguese language. Accordingly, we hope that the proposed expertly annotated corpus may foster research on hate speech and offensive language detection in the Natural Language Processing area.", "evaluation_metadata": {}}, "alexandrainst/scandi-wiki": {"name": "alexandrainst/scandi-wiki", "description": "ScandiWiki is a parsed and deduplicated version of the Danish, Norwegian Bokm\u00e5l,\nNorwegian Nynorsk, Swedish, Icelandic and Faroese Wikipedia corpora, as of January\n2023.", "evaluation_metadata": {}}, "AI4EPS/quakeflow_nc": {"name": "AI4EPS/quakeflow_nc", "description": "A dataset of earthquake waveforms organized by earthquake events and based on the HDF5 format.", "evaluation_metadata": {}}, "ds4sd/DocLayNet": {"name": "ds4sd/DocLayNet", "description": "DocLayNet is a human-annotated document layout segmentation dataset from a broad variety of document sources.", "evaluation_metadata": {}}, "bigcode/commitpack": {"name": "bigcode/commitpack", "description": "\"\"\"\n\nURL = \"https://huggingface.co/datasets/bigcode/commitpack/resolve/main/paths.json\"\n\n_LANG = [\"json\", \"xml\", \"text\", \"javascript\", \"objective-c++\", \"python\", \"c\", \"c++\", \"markdown\", \"java\", \"html\", \"yaml\", \"go\", \"csv\", \"php\", \"jupyter-notebook\", \"gettext-catalog\", \"sql\", \"unity3d-asset\", \"typescript\", \"web-ontology-language\", \"ruby\", \"c#\", \"nix\", \"shell\", \"perl\", \"tex\", \"css\", \"restructuredtext\", \"rust\", \"groff\", \"ini\", \"scala\", \"coffeescript\", \"haskell\", \"swift\", \"lua\", \"svg\", \"gas\", \"ocaml\", \"erlang\", \"makefile\", \"asciidoc\", \"emacs-lisp\", \"scss\", \"clojure\", \"org\", \"common-lisp\", \"diff\", \"groovy\", \"html+erb\", \"nesc\", \"dart\", \"powershell\", \"f#\", \"dm\", \"kotlin\", \"pascal\", \"jsx\", \"viml\", \"actionscript\", \"cython\", \"turtle\", \"less\", \"mathematica\", \"xslt\", \"scheme\", \"perl6\", \"edn\", \"fortran\", \"java-server-pages\", \"standard-ml\", \"cmake\", \"json5\", \"vala\", \"vue\", \"freemarker\", \"graphql\", \"twig\", \"tcl\", \"pod\", \"dockerfile\", \"yacc\", \"postscript\", \"racket\", \"eagle\", \"haxe\", \"julia\", \"handlebars\", \"smarty\", \"visual-basic\", \"literate-haskell\", \"smalltalk\", \"isabelle\", \"nimrod\", \"zig\", \"m4\", \"max\", \"elixir\", \"mako\", \"arduino\", \"jade\", \"haml\", \"elm\", \"purebasic\", \"coldfusion\", \"lean\", \"r\", \"cuda\", \"textile\", \"robotframework\", \"abap\", \"rdoc\", \"llvm\", \"ada\", \"batchfile\", \"qml\", \"jasmin\", \"assembly\", \"g-code\", \"cucumber\", \"html+php\", \"kicad\", \"api-blueprint\", \"eiffel\", \"toml\", \"modelica\", \"bitbake\", \"lex\", \"stylus\", \"protocol-buffer\", \"unknown\", \"nit\", \"factor\", \"xs\", \"sass\", \"parrot-internal-representation\", \"html+django\", \"mediawiki\", \"logos\", \"genshi\", \"coldfusion-cfc\", \"xtend\", \"sqf\", \"vhdl\", \"antlr\", \"systemverilog\", \"hcl\", \"asp\", \"nsis\", \"inform-7\", \"slim\", \"groovy-server-pages\", \"ceylon\", \"fish\", \"processing\", \"component-pascal\", \"lasso\", \"glsl\", \"saltstack\", \"xbase\", \"autohotkey\", \"liquid\", \"purescript\", \"agda\", \"inno-setup\", \"oz\", \"chapel\", \"arc\", \"opencl\", \"graphviz-dot\", \"pawn\", \"jsoniq\", \"bluespec\", \"smali\", \"krl\", \"maple\", \"unrealscript\", \"ooc\", \"pure-data\", \"xquery\", \"digital-command-language\", \"moonscript\", \"awk\", \"pike\", \"livescript\", \"solidity\", \"monkey\", \"jsonld\", \"zephir\", \"crystal\", \"rhtml\", \"stata\", \"idris\", \"raml\", \"openscad\", \"red\", \"c2hs-haskell\", \"cycript\", \"applescript\", \"mupad\", \"literate-agda\", \"boo\", \"sourcepawn\", \"qmake\", \"ragel-in-ruby-host\", \"io\", \"desktop\", \"propeller-spin\", \"thrift\", \"volt\", \"xproc\", \"igor-pro\", \"lolcode\", \"html+eex\", \"logtalk\", \"mirah\", \"gnuplot\", \"literate-coffeescript\", \"jflex\", \"emberscript\", \"cobol\", \"yang\", \"rebol\", \"linker-script\", \"cartocss\", \"urweb\", \"rmarkdown\", \"darcs-patch\", \"csound\", \"squirrel\", \"apl\", \"hlsl\", \"latte\", \"pony\", \"ioke\", \"hy\", \"uno\", \"pan\", \"xojo\", \"papyrus\", \"stan\", \"slash\", \"supercollider\", \"vcl\", \"smt\", \"glyph\", \"wisp\", \"renpy\", \"clips\", \"dns-zone\", \"sas\", \"rouge\", \"ec\", \"dylan\", \"tcsh\", \"aspectj\", \"netlogo\", \"gap\", \"fancy\", \"coq\", \"click\", \"capn-proto\", \"flux\", \"forth\", \"ats\", \"netlinx\", \"clean\", \"parrot-assembly\", \"alloy\", \"lfe\", \"gdscript\", \"augeas\", \"sparql\", \"lilypond\", \"scilab\", \"autoit\", \"myghty\", \"blitzmax\", \"creole\", \"harbour\", \"piglatin\", \"opa\", \"sage\", \"ston\", \"maxscript\", \"lsl\", \"gentoo-ebuild\", \"nu\", \"bro\", \"xc\", \"j\", \"metal\", \"module-management-system\", \"webidl\", \"tea\", \"redcode\", \"shen\", \"pov-ray-sdl\", \"x10\", \"brainfuck\", \"ninja\", \"golo\", \"webassembly\", \"self\", \"labview\", \"octave\", \"pogoscript\", \"d\", \"http\", \"ecl\", \"chuck\", \"gosu\", \"parrot\", \"opal\", \"objective-j\", \"kit\", \"gams\", \"prolog\", \"clarion\", \"mask\", \"brightscript\", \"scaml\", \"matlab\", \"idl\", \"ags-script\", \"lookml\", \"apacheconf\", \"oxygene\", \"txl\", \"grammatical-framework\", \"renderscript\", \"mtml\", \"unified-parallel-c\", \"dogescript\", \"gentoo-eclass\", \"zimpl\", \"irc-log\", \"fantom\", \"numpy\", \"cirru\", \"xpages\", \"nginx\", \"objdump\", \"python-traceback\", \"realbasic\", \"befunge\", \"bison\", \"m\", \"omgrofl\"]\n\n_LICENSE = \"Apache License 2.0\"\n_VERSION = datasets.Version(\"1.0.0\", \"\")\n\n\nclass CommitPack(datasets.GeneratorBasedBuilder):\n BUILDER_CONFIGS = [\n datasets.BuilderConfig(\n name=lang,\n description=f\"CommitPack {lang}\",\n version=_VERSION,\n )\n for lang in _LANG\n ]\n\n def _info(self):\n return datasets.DatasetInfo(\n description=_DESCRIPTION,\n features=datasets.Features(\n {\n \"commit\": datasets.Value(\"string\"),\n \"old_file\": datasets.Value(\"string\"),\n \"new_file\": datasets.Value(\"string\"),\n \"old_contents\": datasets.Value(\"string\"),\n \"new_contents\": datasets.Value(\"string\"),\n \"subject\": datasets.Value(\"string\"),\n \"message\": datasets.Value(\"string\"),\n \"lang\": datasets.Value(\"string\"),\n \"license\": datasets.Value(\"string\"),\n \"repos\": datasets.Value(\"string\"),\n# \"returncode\": datasets.Value(\"int64\"),\n# \"stderr\": datasets.Value(\"string\"),\n }\n ),\n supervised_keys=None,\n citation=_CITATION,\n )\n \n def _split_generators(self, dl_manager):\n\n path_file = dl_manager.download(URL)\n with open(path_file, \"r\") as f:\n files = json.load(f)\n\n downloaded_files = dl_manager.download(files[self.config.name])\n return [\n datasets.SplitGenerator(\n name=datasets.Split.TRAIN,\n gen_kwargs={'filepaths': downloaded_files}\n )\n ]\n\n def _generate_examples(self, filepaths):", "evaluation_metadata": {}}, "polinaeterna/lila_camera_traps": {"name": "polinaeterna/lila_camera_traps", "description": "LILA Camera Traps is an aggregate data set of images taken by camera traps, which are devices that automatically (e.g. via motion detection) capture images of wild animals to help ecological research.\n\nThis data set is the first time when disparate camera trap data sets have been aggregated into a single training environment with a single taxonomy.\n\nThis data set consists of only camera trap image data sets, whereas the broader LILA website also has other data sets related to biology and conservation, intended as a resource for both machine learning (ML) researchers and those that want to harness ML for this topic.", "evaluation_metadata": {}}, "mariosasko/glue": {"name": "mariosasko/glue", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": [{"config": "cola", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence": "text", "label": "target"}}, {"config": "sst2", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence": "text", "label": "target"}}, {"config": "mrpc", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "qqp", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question1": "text1", "question2": "text2", "label": "target"}}, {"config": "stsb", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "mnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation_matched"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "mnli_mismatched", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "mnli_matched", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"premise": "text1", "hypothesis": "text2", "label": "target"}}, {"config": "qnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question": "text1", "sentence": "text2", "label": "target"}}, {"config": "rte", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}, {"config": "wnli", "task": "text-classification", "task_id": "natural_language_inference", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"sentence1": "text1", "sentence2": "text2", "label": "target"}}]}, "Hello-SimpleAI/HC3": {"name": "Hello-SimpleAI/HC3", "description": "Human ChatGPT Comparison Corpus (HC3)", "evaluation_metadata": {}}, "Hello-SimpleAI/HC3-Chinese": {"name": "Hello-SimpleAI/HC3-Chinese", "description": "Human ChatGPT Comparison Corpus (HC3) Chinese Version", "evaluation_metadata": {}}, "jonatli/the_pile_mystic": {"name": "jonatli/the_pile_mystic", "description": "The Pile is a 825 GiB diverse, open source language modelling data set that consists of 22 smaller, high-quality\ndatasets combined together.", "evaluation_metadata": {}}, "larrylawl/douban-dushu": {"name": "larrylawl/douban-dushu", "description": "This dataset contains book reviews from DouBan Dushu. DouBan DuShu is a Chinese website where users can share their reviews about various kinds of books. Most of the users in this website are unprofessional book reviewers. Therefore, the comments are usually spoken Chinese or even Internet slang.", "evaluation_metadata": {}}, "ruanchaves/b2w-reviews01": {"name": "ruanchaves/b2w-reviews01", "description": "B2W-Reviews01 is an open corpus of product reviews. It contains more than 130k e-commerce customer reviews, collected from the Americanas.com website between January and May, 2018. B2W-Reviews01 offers rich information about the reviewer profile, such as gender, age, and geographical location. The corpus also has two different review rates", "evaluation_metadata": {}}, "DFKI-SLT/knowledge_net": {"name": "DFKI-SLT/knowledge_net", "description": "KnowledgeNet is a benchmark dataset for the task of automatically populating a knowledge base (Wikidata) with facts \nexpressed in natural language text on the web. KnowledgeNet provides text exhaustively annotated with facts, thus \nenabling the holistic end-to-end evaluation of knowledge base population systems as a whole, unlike previous benchmarks \nthat are more suitable for the evaluation of individual subcomponents (e.g., entity linking, relation extraction).\n\nFor instance, the dataset contains text expressing the fact (Gennaro Basile; RESIDENCE; Moravia), in the passage: \n\"Gennaro Basile was an Italian painter, born in Naples but active in the German-speaking countries. He settled at Br\u00fcnn, \nin Moravia, and lived about 1756...\"\n\nFor a description of the dataset and baseline systems, please refer to their \n[EMNLP paper](https://github.com/diffbot/knowledge-net/blob/master/knowledgenet-emnlp-cameraready.pdf).\n\nNote: This Datasetreader currently only supports the `train` split and does not contain negative examples", "evaluation_metadata": {}}, "DFKI-SLT/cross_ner": {"name": "DFKI-SLT/cross_ner", "description": "CrossNER is a fully-labeled collected of named entity recognition (NER) data spanning over five diverse domains \n(Politics, Natural Science, Music, Literature, and Artificial Intelligence) with specialized entity categories for \ndifferent domains. Additionally, CrossNER also includes unlabeled domain-related corpora for the corresponding five \ndomains.\n\nFor details, see the paper: \n[CrossNER: Evaluating Cross-Domain Named Entity Recognition](https://arxiv.org/abs/2012.04373)", "evaluation_metadata": {}}, "DFKI-SLT/cross_re": {"name": "DFKI-SLT/cross_re", "description": "CrossRE is a new, freely-available crossdomain benchmark for RE, which comprises six distinct text domains and includes \nmultilabel annotations. It includes the following domains: news, politics, natural science, music, literature and \nartificial intelligence. The semantic relations are annotated on top of CrossNER (Liu et al., 2021), a cross-domain\ndataset for NER which contains domain-specific entity types.\nThe dataset contains 17 relation labels for the six domains: PART-OF, PHYSICAL, USAGE, ROLE, SOCIAL, \nGENERAL-AFFILIATION, COMPARE, TEMPORAL, ARTIFACT, ORIGIN, TOPIC, OPPOSITE, CAUSE-EFFECT, WIN-DEFEAT, TYPEOF, NAMED, and \nRELATED-TO.\n\nFor details, see the paper: https://arxiv.org/abs/2210.09345", "evaluation_metadata": {}}, "relbert/semeval2012_relational_similarity": {"name": "relbert/semeval2012_relational_similarity", "description": "[SemEVAL 2012 task 2: Relational Similarity](https://aclanthology.org/S12-1047/)", "evaluation_metadata": {}}, "zpn/GRCh38": {"name": "zpn/GRCh38", "description": "A dataset of all autosomal and sex chromosomes sequences from reference assembly GRCh38/hg38 1 and reached a total of 3.2 billion nucleotides.", "evaluation_metadata": {}}, "philschmid/emotion": {"name": "philschmid/emotion", "description": "Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "maritaca-ai/ag_news_pt": {"name": "maritaca-ai/ag_news_pt", "description": "AG is a collection of more than 1 million news articles. News articles have been\ngathered from more than 2000 news sources by ComeToMyHead in more than 1 year of\nactivity. ComeToMyHead is an academic news search engine which has been running\nsince July, 2004. The dataset is provided by the academic comunity for research\npurposes in data mining (clustering, classification, etc), information retrieval\n(ranking, search, etc), xml, data compression, data streaming, and any other\nnon-commercial activity. For more information, please refer to the link\nhttp://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .\n\nThe AG's news topic classification dataset is constructed by Xiang Zhang\n(xiang.zhang@nyu.edu) from the dataset above. It is used as a text\nclassification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann\nLeCun. Character-level Convolutional Networks for Text Classification. Advances\nin Neural Information Processing Systems 28 (NIPS 2015).", "evaluation_metadata": {}}, "yhavinga/imdb_dutch": {"name": "yhavinga/imdb_dutch", "description": "Large Movie Review Dataset translated to Dutch.\n\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 24,992 highly polar movie reviews for training, and 24,992 for testing. There is additional unlabeled data for use as well.\\", "evaluation_metadata": [{"config": "plain_text", "task": "text-classification", "task_id": "binary_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy"}, {"name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "anjalyjayakrishnan/test": {"name": "anjalyjayakrishnan/test", "description": "The Snow Mountain dataset contains the audio recordings (in .mp3 format) and the corresponding text of The Bible \nin 11 Indian languages. The recordings were done in a studio setting by native speakers. Each language has a single \nspeaker in the dataset. Most of these languages are geographically concentrated in the Northern part of India around \nthe state of Himachal Pradesh. Being related to Hindi they all use the Devanagari script for transcription.", "evaluation_metadata": {}}, "imvladikon/parashoot": {"name": "imvladikon/parashoot", "description": "A Hebrew question and answering dataset in the style of SQuAD, based on articles scraped from Wikipedia. The dataset contains a few thousand crowdsource-annotated pairs of questions and answers, in a setting suitable for few-shot learning.", "evaluation_metadata": {}}, "relbert/scientific_and_creative_analogy": {"name": "relbert/scientific_and_creative_analogy", "description": "Dataset for relation mapping task (see [paper](https://arxiv.org/abs/2211.15268)).", "evaluation_metadata": {}}, "rcds/swiss_legislation": {"name": "rcds/swiss_legislation", "description": "This dataset contains Swiss law articles", "evaluation_metadata": {}}, "JeremyAlain/SLF5K": {"name": "JeremyAlain/SLF5K", "description": "The Summarization with Language Feedback (SLF5K) dataset is an English-language dataset containing 5K unique samples that can be used for the task of abstraction summarization. Each sample consists of a Reddit title and post, a model-generated (FeedME) summary, and human-written language feedback on that summary. Additionally, each sample has a high-quality, human-written (gold) summary that should be ideal for the Reddit post. Lastly, each sample has two additional model-generated summaries with binary human preference labels, on which summary is preferred by a human. The dataset can be used to train language models with language feedback on abstractive summarization. It can also be used to train a reward model on binary preferences.", "evaluation_metadata": {}}, "hfaus/CelebA_bbox_and_facepoints": {"name": "hfaus/CelebA_bbox_and_facepoints", "description": "CelebFaces Attributes Dataset (CelebA) is a large-scale face attributes dataset with more than 200K celebrity images,\neach with 40 attribute annotations. The images in this dataset cover large pose variations and background clutter.\nCelebA has large diversities, large quantities, and rich annotations, including 10,177 number of identities, 202,599 number of face images,\nand 5 landmark locations, 40 binary attributes annotations per image.", "evaluation_metadata": {}}, "indonlp/NusaX-senti": {"name": "indonlp/NusaX-senti", "description": "NusaX is a high-quality multilingual parallel corpus that covers 12 languages, Indonesian, English, and 10 Indonesian local languages, namely Acehnese, Balinese, Banjarese, Buginese, Madurese, Minangkabau, Javanese, Ngaju, Sundanese, and Toba Batak.\nNusaX-Senti is a 3-labels (positive, neutral, negative) sentiment analysis dataset for 10 Indonesian local languages + Indonesian and English.", "evaluation_metadata": {}}, "relbert/nell": {"name": "relbert/nell", "description": "Few shots link prediction dataset.", "evaluation_metadata": {}}, "awalesushil/DBLP-QuAD": {"name": "awalesushil/DBLP-QuAD", "description": " DBLP-QuAD is a scholarly knowledge graph question answering dataset with 10,000 question - SPARQL query pairs targeting the DBLP knowledge graph. The dataset is split into 7,000 training, 1,000 validation and 2,000 test questions.", "evaluation_metadata": {}}, "jordyvl/DUDE_loader": {"name": "jordyvl/DUDE_loader", "description": "DUDE requires models to reason and understand about document layouts in multi-page images/PDFs to answer questions about them.\nSpecifically, models need to incorporate a new modality of layout present in the images/PDFs and reason\nover it to answer DUDE questions.", "evaluation_metadata": {}}, "indonlp/NusaX-MT": {"name": "indonlp/NusaX-MT", "description": "NusaX is a high-quality multilingual parallel corpus that covers 12 languages, Indonesian, English, and 10 Indonesian local languages, namely Acehnese, Balinese, Banjarese, Buginese, Madurese, Minangkabau, Javanese, Ngaju, Sundanese, and Toba Batak.\nNusaX-MT is a parallel corpus for training and benchmarking machine translation models across 10 Indonesian local languages + Indonesian and English. The data is presented in csv format with 12 columns, one column for each language.", "evaluation_metadata": {}}, "liyucheng/chinese_metaphor_dataset": {"name": "liyucheng/chinese_metaphor_dataset", "description": "Chinese Metaphor Corpus\n\nThe first Chinese metaphor corpus serving both metaphor identification and generation. \n\u9996\u4e2a\u4e2d\u6587\u6bd4\u55bb\u6570\u636e\u96c6\uff0c\u53ef\u4ee5\u7528\u4e8e\u4e2d\u6587\u6bd4\u55bb\u8bc6\u522b\u4e0e\u4e2d\u6587\u6bd4\u55bb\u751f\u6210\u3002", "evaluation_metadata": {}}, "ncoop57/mmmlu": {"name": "ncoop57/mmmlu", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "polinaeterna/xsum": {"name": "polinaeterna/xsum", "description": "Extreme Summarization (XSum) Dataset.\n\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.", "evaluation_metadata": {}}, "pierreguillou/DocLayNet-small": {"name": "pierreguillou/DocLayNet-small", "description": "Accurate document layout analysis is a key requirement for high-quality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present \\textit{DocLayNet}, a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide smallline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10\\% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNet-trained models are more robust and thus the preferred choice for general-purpose document-layout analysis.", "evaluation_metadata": {}}, "pierreguillou/DocLayNet-base": {"name": "pierreguillou/DocLayNet-base", "description": "Accurate document layout analysis is a key requirement for high-quality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present \\textit{DocLayNet}, a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide smallline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10\\% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNet-trained models are more robust and thus the preferred choice for general-purpose document-layout analysis.", "evaluation_metadata": {}}, "gorar/A-MNIST": {"name": "gorar/A-MNIST", "description": "The dataset is built on top of MNIST.\nIt consists from 130K of images in 10 classes - 120K training and 10K test samples.\nThe training set was augmented with additional 60K images.", "evaluation_metadata": {}}, "maritaca-ai/imdb_pt": {"name": "maritaca-ai/imdb_pt", "description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.\\", "evaluation_metadata": {}}, "juletxara/xstory_cloze": {"name": "juletxara/xstory_cloze", "description": "XStoryCloze consists of the professionally translated version of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 non-English languages. This dataset is released by Meta AI.", "evaluation_metadata": {}}, "lunesco/conll2003": {"name": "lunesco/conll2003", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "maritaca-ai/sst2_pt": {"name": "maritaca-ai/sst2_pt", "description": "The Stanford Sentiment Treebank consists of sentences from movie reviews and\nhuman annotations of their sentiment. The task is to predict the sentiment of a\ngiven sentence. We use the two-way (positive/negative) class split, and use only\nsentence-level labels.", "evaluation_metadata": {}}, "maritaca-ai/boolq_pt": {"name": "maritaca-ai/boolq_pt", "description": "BoolQ is a question answering dataset for yes/no questions containing 15942 examples. These questions are naturally\noccurring ---they are generated in unprompted and unconstrained settings.\nEach example is a triplet of (question, passage, answer), with the title of the page as optional additional context.\nThe text-pair classification setup is similar to existing natural language inference tasks.", "evaluation_metadata": {}}, "rcds/swiss_court_view_generation": {"name": "rcds/swiss_court_view_generation", "description": "This dataset contains court decision for court view generation task.", "evaluation_metadata": {}}, "TurkuNLP/squad_v2_fi": {"name": "TurkuNLP/squad_v2_fi", "description": "combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers\n to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but\n also determine when no answer is supported by the paragraph and abstain from answering.", "evaluation_metadata": {}}, "biu-nlp/alsqa": {"name": "biu-nlp/alsqa", "description": "To test the lexical overlap heuristic utilization in Reading Comprehension models, we create a new test set: Analyzing Lexically Similar QA (ALSQA).\nWe augment the SQuAD 2.0 dataset (Rajpurkar et al., 2018) by asking crowdworkers to generate questions with high context-overlap from questions with low overlap (These questions are paraphrases of the original questions).\nIn the case of un-answerable questions, annotators were asked to re-write the question without changing its meaning and maintain the unanswerability reason.3 ALSQA contains 365 questions pairs, 190 with an- swer and 174 without answer.", "evaluation_metadata": {}}, "tobiolatunji/afrispeech-200": {"name": "tobiolatunji/afrispeech-200", "description": "AFRISPEECH-200 is a 200hr Pan-African speech corpus for clinical and general domain English accented ASR; \na dataset with 120 African accents from 13 countries and 2,463 unique African speakers. \nOur goal is to raise awareness for and advance Pan-African English ASR research, \nespecially for the clinical domain.", "evaluation_metadata": {}}, "rvorias/realms_adventurers": {"name": "rvorias/realms_adventurers", "description": "This is the public dataset for the realms adventurer generator.\nIt contains images of characters and annotations to form structured captions.", "evaluation_metadata": {}}, "joelito/MultiLegalPileWikipediaFiltered": {"name": "joelito/MultiLegalPileWikipediaFiltered", "description": "A filtered version of the MultiLegalPile dataset, together with wikipedia articles.", "evaluation_metadata": {}}, "Basvoju/SemEval2018Task7": {"name": "Basvoju/SemEval2018Task7", "description": "This paper describes the first task on semantic relation extraction and classification in scientific paper\nabstracts at SemEval 2018. The challenge focuses on domain-specific semantic relations and includes three \ndifferent subtasks. The subtasks were designed so as to compare and quantify the effect of different\npre-processing steps on the relation classification results. We expect the task to be relevant for a broad \nrange of researchers working on extracting specialized knowledge from domain corpora, for example but not \nlimited to scientific or bio-medical information extraction. The task attracted a total of 32 participants, \nwith 158 submissions across different scenarios.", "evaluation_metadata": [{"col_mapping": {"labels": "tags", "tokens": "tokens"}, "config": "default", "splits": {"eval_split": "test"}, "task": "text-classification", "task_id": "entity_extraction"}]}, "ds4sd/icdar2023-doclaynet": {"name": "ds4sd/icdar2023-doclaynet", "description": "Dataset for the ICDAR 2023 Competition on Robust Layout Segmentation in Corporate Documents.", "evaluation_metadata": {}}, "bridgeconn/snow-mountain": {"name": "bridgeconn/snow-mountain", "description": "The Snow Mountain dataset contains the audio recordings (in .mp3 format) and the corresponding text of The Bible \nin 11 Indian languages. The recordings were done in a studio setting by native speakers. Each language has a single \nspeaker in the dataset. Most of these languages are geographically concentrated in the Northern part of India around \nthe state of Himachal Pradesh. Being related to Hindi they all use the Devanagari script for transcription.", "evaluation_metadata": {}}, "lukaemon/bbh": {"name": "lukaemon/bbh", "description": "BBH focuses on a suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH). These are the task for which prior language model evaluations did not outperform the average human-rater. We find that applying chain-of-thought (CoT) prompting to BBH tasks enables PaLM to surpass the average humanrater performance on 10 of the 23 tasks, and Codex (code-davinci-002) to surpass the average human-rater performance on 17 of the 23 tasks. Since many tasks in BBH require multi-step reasoning, few-shot prompting without CoT, as done in the BIG-Bench evaluations (Srivastava et al., 2022), substantially underestimates the best performance and capabilities of language models, which is better captured via CoT prompting. As further analysis, we explore the interaction between CoT and model scale on BBH, finding that CoT enables emergent task performance on several BBH tasks with otherwise flat scaling curves.", "evaluation_metadata": {}}, "tasksource/mmlu": {"name": "tasksource/mmlu", "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.", "evaluation_metadata": {}}, "lukaemon/mmlu": {"name": "lukaemon/mmlu", "description": "Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).", "evaluation_metadata": {}}, "lunesco/conll2003-v2": {"name": "lunesco/conll2003-v2", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "DFKI-SLT/SemEval2018_Task7": {"name": "DFKI-SLT/SemEval2018_Task7", "description": "This paper describes the first task on semantic relation extraction and classification in scientific paper\nabstracts at SemEval 2018. The challenge focuses on domain-specific semantic relations and includes three \ndifferent subtasks. The subtasks were designed so as to compare and quantify the effect of different\npre-processing steps on the relation classification results. We expect the task to be relevant for a broad \nrange of researchers working on extracting specialized knowledge from domain corpora, for example but not \nlimited to scientific or bio-medical information extraction. The task attracted a total of 32 participants, \nwith 158 submissions across different scenarios.", "evaluation_metadata": [{"col_mapping": {"labels": "tags", "tokens": "tokens"}, "config": "default", "splits": {"eval_split": "test"}, "task": "text-classification", "task_id": "entity_extraction"}]}, "othertea/epigenetic_marks_pham2005": {"name": "othertea/epigenetic_marks_pham2005", "description": " This contains datasets of histone occupancy, acetylation, and methylation by ChiP-Chip protocol in vivo from Pham et al., as retrieved from https://www.jaist.ac.jp/~tran/nucleosome/members.htm in January 2023.", "evaluation_metadata": {}}, "biu-nlp/qa_adj": {"name": "biu-nlp/qa_adj", "description": "The dataset contains question-answer pairs to capture adjectival semantics. \nThis dataset was annotated by selected workers from Amazon Mechanical Turk.", "evaluation_metadata": {}}, "sartajekram/BanglaRQA": {"name": "sartajekram/BanglaRQA", "description": "BanglaRQA is a human-annotated Bangla Question Answering (QA) dataset with diverse question-answer types.", "evaluation_metadata": {}}, "ielab/xor-tydi": {"name": "ielab/xor-tydi", "description": "The english Wikipedia 2019-0201 passage dump that used for xor-tydi retrieval task, available at https://archive.org/download/enwiki-20190201/enwiki-20190201-pages-articles-multistream.xml.bz2", "evaluation_metadata": {}}, "allenai/lila": {"name": "allenai/lila", "description": "L\u012bla is a comprehensive benchmark for mathematical reasoning with over 140K natural language questions annotated with Python programs and natural language instructions. The data set comes with multiple splits: L\u012bla-IID (train, dev, test), L\u012bla-OOD (train, dev, test), and L\u012bla-Robust.", "evaluation_metadata": {}}, "raki-1203/ai_hub_summarization": {"name": "raki-1203/ai_hub_summarization", "description": "\"\"\"\n_LICENSE = \"CC-BY-SA-4.0\"\n# _URL = \"https://github.com/boostcampaitech2/data-annotation-nlp-level3-nlp-14\"\n_DATA_URLS = {\n \"train\": \"https://huggingface.co/datasets/raki-1203/ai_hub_summarization/resolve/main/train_dict.json\",\n \"valid\": \"https://huggingface.co/datasets/raki-1203/ai_hub_summarization/resolve/main/valid_dict.json\",\n \"test\": \"https://huggingface.co/datasets/raki-1203/ai_hub_summarization/resolve/main/test_dict.json\",\n}\n\n_VERSION = \"0.0.0\"\n\n\nclass AiHubSummarizationConfig(datasets.BuilderConfig):\n def __init__(self, data_url, **kwargs):\n super().__init__(version=datasets.Version(_VERSION), **kwargs)\n self.data_url = data_url\n\n\nclass AiHubSummarization(datasets.GeneratorBasedBuilder):\n DEFAULT_CONFIG_NAME = \"ai_hub_summarization\"\n BUILDER_CONFIGS = [\n AiHubSummarizationConfig(\n name=\"ai_hub_summarization\",\n data_url=_DATA_URLS,\n description=_DESCRIPTION,\n )\n ]\n\n def _info(self):\n return datasets.DatasetInfo(\n description=_DESCRIPTION,\n features=datasets.Features(\n {\n \"data_name\": datasets.Value(\"string\"),\n \"doc_id\": datasets.Value(\"string\"),\n \"doc_name\": datasets.Value(\"string\"),\n \"passage\": datasets.Value(\"string\"),\n \"abstract_summary\": datasets.Value(\"string\"),\n }\n ),\n license=_LICENSE,\n citation=_CITATION,\n supervised_keys=None,\n )\n\n def _split_generators(self, dl_manager):", "evaluation_metadata": {}}, "cjvt/senticoref": {"name": "cjvt/senticoref", "description": "Slovene corpus for coreference resolution. Contains automatically(?) annotated named entities, manually annotated \ncoreferences, and manually verified lemmas and morphosyntactic tags.", "evaluation_metadata": {}}, "neulab/mconala": {"name": "neulab/mconala", "description": "MCoNaLa is a Multilingual Code/Natural Language Challenge dataset with \n896 NL-Code pairs in three languages: Spanish, Japanese, and Russian.", "evaluation_metadata": {}}, "Achitha/tamil_eng_data": {"name": "Achitha/tamil_eng_data", "description": "The data contains roughly one and half hours of audio and transcripts in Tamil language.", "evaluation_metadata": {}}, "andstor/output": {"name": "andstor/output", "description": "This is a dataset consisting of the output from various language models and datasets.", "evaluation_metadata": {}}, "liwu/MNBVC": {"name": "liwu/MNBVC", "description": "MNBVC: Massive Never-ending BT Vast Chinese corpus", "evaluation_metadata": {}}, "HiTZ/euscrawl": {"name": "HiTZ/euscrawl", "description": "EusCrawl (http://www.ixa.eus/euscrawl/) is a high-quality corpus for\nBasque comprising 12.5 million documents and 423 million tokens,\ntotalling 2.1 GiB of uncompressed text. EusCrawl was built using\nad-hoc scrapers to extract text from 33 Basque websites with\nhigh-quality content, resulting in cleaner text compared to general\npurpose approaches.\n\nWe do not claim ownership of any document in the corpus. All documents\nwe collected were published under a Creative Commons license in their\noriginal website, and the specific variant can be found in the\n\"license\" field of each document. Should you consider\nthat our data contains material that is owned by you and you would not\nlike to be reproduced here, please contact Aitor Soroa at\na.soroa@ehu.eus.\n\nFor more details about the corpus, refer to our paper \"Artetxe M.,\nAldabe I., Agerri R., Perez-de-Vi\u00f1aspre O, Soroa A. (2022). Does\nCorpus Quality Really Matter for Low-Resource Languages?\"\nhttps://arxiv.org/abs/2203.08111\n\nIf you use our corpus or models for academic research, please cite the paper in question:\n@misc{artetxe2022euscrawl,\n title={Does corpus quality really matter for low-resource languages?},\n author={Mikel Artetxe, Itziar Aldabe, Rodrigo Agerri, Olatz Perez-de-Vi\u00f1aspre, Aitor Soroa},\n year={2022},\n eprint={2203.08111},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n\nFor questions please contact Aitor Soroa at a.soroa@ehu.eus.", "evaluation_metadata": {}}, "Matilde/Homo_ita": {"name": "Matilde/Homo_ita", "description": "Grapheme-to-Phoneme training, validation and test sets", "evaluation_metadata": {}}, "Krystalan/xmediasum": {"name": "Krystalan/xmediasum", "description": "We present XMediaSum, a cross-lingual dialogue summarization dataset with 40K English(dialogues)->Chinese(summaries) and 40K English (dialogues)->German(summaries) samples. XMediaSum is created by manually translating the English summaries of MediaSum (a English monolingual dialogue summarization dataset) to both Chinese and German.", "evaluation_metadata": {}}, "GEM/xmediasum": {"name": "GEM/xmediasum", "description": "\\\r\nWe present XMediaSum, a cross-lingual dialogue summarization dataset with 40K English(dialogues)->Chinese(summaries) and 40K English (dialogues)->German(summaries) samples. XMediaSum is created by manually translating the English summaries of MediaSum (a English monolingual dialogue summarization dataset) to both Chinese and German.", "evaluation_metadata": {}}, "TobiTob/CityLearn": {"name": "TobiTob/CityLearn", "description": "The dataset consists of tuples of (observations, actions, rewards, dones) sampled by agents\n interacting with the CityLearn 2022 Phase 1 environment (only first 5 buildings)", "evaluation_metadata": {}}, "shmuhammad/AfriSenti-twitter-sentiment": {"name": "shmuhammad/AfriSenti-twitter-sentiment", "description": "AfriSenti is the largest sentiment analysis benchmark dataset for under-represented African languages---covering 110,000+ annotated tweets in 14 African languages (Amharic, Algerian Arabic, Hausa, Igbo, Kinyarwanda, Moroccan Arabic, Mozambican Portuguese, Nigerian Pidgin, Oromo, Swahili, Tigrinya, Twi, Xitsonga, and yoruba).", "evaluation_metadata": {}}, "Gholamreza/pquad": {"name": "Gholamreza/pquad", "description": "\\\\\\PQuAD: PQuAD is a crowd-sourced reading comprehension dataset on Persian Language.", "evaluation_metadata": [{"config": "pquad", "task": "question-answering", "task_id": "extractive_question_answering", "splits": {"train_split": "train", "eval_split": "validation"}, "col_mapping": {"question": "question", "context": "context", "answers": {"text": "text", "answer_start": "answer_start"}}, "metrics": [{"type": "pquad", "name": "PQuAD"}]}]}, "matchbench/selfkg-dwy100k-dbpyg": {"name": "matchbench/selfkg-dwy100k-dbpyg", "description": "# DWY100k-yg is a large-scale monolingual dataset extracted from DBpedia and YAGO3. The suffix yg means DBpedia \n# to YAGO3. And DWY100k-yg has 100,000 reference entity alignments.\n#", "evaluation_metadata": {}}, "theblackcat102/alexa-qa-with-rank": {"name": "theblackcat102/alexa-qa-with-rank", "description": "Alexa question and answer examples with rank", "evaluation_metadata": {}}, "gorrox14/TxoriakTxori": {"name": "gorrox14/TxoriakTxori", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "nanaaaa/emotion_chinese_english": {"name": "nanaaaa/emotion_chinese_english", "description": "The emotion_chinese_english dataset is a multilingual emotion dataset annotated by language experts under a project. The dataset can be used for tasks such as multilingual (Chinese and English) emotion classification and identification.", "evaluation_metadata": {}}, "t0mmy/livedoor_news_corpus": {"name": "t0mmy/livedoor_news_corpus", "description": "This corpus is from news stories in \u201clivedoor news\u201d administered by NHN Japan and only the following ones that are governed by Creative Commons license were collected and had as many HTML tags as possible deleted.", "evaluation_metadata": {}}, "Achitha/10th_science_tamil_to_english": {"name": "Achitha/10th_science_tamil_to_english", "description": "The data contains roughly one and half hours of audio and transcripts in Tamil language.", "evaluation_metadata": {}}, "semeru/completeformer": {"name": "semeru/completeformer", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "wwydmanski/colorectal-carcinoma-microbiome-fengq": {"name": "wwydmanski/colorectal-carcinoma-microbiome-fengq", "description": "The dataset contains 16S rRNA gene sequencing data from healthy controls and colorectal cancer patients. The dataset was used in the paper \"Gut microbiome development along the colorectal adenoma-carcinoma sequence\" by Feng et al. (2015).", "evaluation_metadata": {}}, "hezarai/sentiment_digikala_snappfood": {"name": "hezarai/sentiment_digikala_snappfood", "description": "Sentiment analysis dataset extracted and labeled from Digikala and Snapp Food comments", "evaluation_metadata": {}}, "gorrox14/image-demo": {"name": "gorrox14/image-demo", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "nanaaaa/emotion_english": {"name": "nanaaaa/emotion_english", "description": "The emotion_english dataset is an emotion dataset annotated by language experts under a project. The dataset can be used for tasks such as English emotion classification and identification.", "evaluation_metadata": {}}, "marcolin/demo": {"name": "marcolin/demo", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "shunk031/JGLUE": {"name": "shunk031/JGLUE", "description": "JGLUE, Japanese General Language Understanding Evaluation, is built to measure the general NLU ability in Japanese. JGLUE has been constructed from scratch without translation. We hope that JGLUE will facilitate NLU research in Japanese.", "evaluation_metadata": {}}, "aadhiya/image-upoload": {"name": "aadhiya/image-upoload", "description": "Demo dataset for testing or showing image-text capabilities.", "evaluation_metadata": {}}, "aadhiya/image-test": {"name": "aadhiya/image-test", "description": "Demo dataset for testing or showing image-text capabilities.", "evaluation_metadata": {}}, "gavincapriola/botpegs": {"name": "gavincapriola/botpegs", "description": "A small sample of image-text pairs from the BotPegs dataset.", "evaluation_metadata": {}}, "OpenDILabCommunity/Pong-v4-expert-MCTS": {"name": "OpenDILabCommunity/Pong-v4-expert-MCTS", "description": "Data sampled from an efficient-zero policy in the pong environment. The MCTS hidden state is included in the dataset.", "evaluation_metadata": {}}, "MultiCoNER/multiconer_v2": {"name": "MultiCoNER/multiconer_v2", "description": "Complex named entities (NE), like the titles of creative works, are not simple nouns and pose challenges for NER systems (Ashwini and Choi, 2014). They can take the form of any linguistic constituent, like an imperative clause (\u201cDial M for Murder\u201d), and do not look like traditional NEs (Persons, Locations, etc.). This syntactic ambiguity makes it challenging to recognize them based on context. We organized the MultiCoNER task (Malmasi et al., 2022) at SemEval-2022 to address these challenges in 11 languages, receiving a very positive community response with 34 system papers. Results confirmed the challenges of processing complex and long-tail NEs: even the largest pre-trained Transformers did not achieve top performance without external knowledge. The top systems infused transformers with knowledge bases and gazetteers. However, such solutions are brittle against out of knowledge-base entities and noisy scenarios like the presence of spelling mistakes and typos. We propose MultiCoNER II which represents novel challenges through new tasks that emphasize the shortcomings of the current top models.\n\nMultiCoNER II features complex NER in these languages:\n\n1. English\n2. Spanish\n3. Hindi\n4. Bangla\n5. Chinese\n6. Swedish\n7. Farsi\n8. French\n9. Italian\n10. Portugese\n11. Ukranian\n12. German\n\nFor more details see https://multiconer.github.io/\n\n## References\n* Sandeep Ashwini and Jinho D. Choi. 2014. Targetable named entity recognition in social media. CoRR, abs/1408.0782.\n* Shervin Malmasi, Anjie Fang, Besnik Fetahu, Sudipta Kar, Oleg Rokhlenko. 2022. SemEval-2022 Task 11: Multilingual Complex Named Entity Recognition (MultiCoNER).", "evaluation_metadata": {}}, "HuggingFaceH4/hhh_alignment": {"name": "HuggingFaceH4/hhh_alignment", "description": "This task evaluates language models on alignment, broken down into categories of helpfulness, honesty/accuracy, harmlessness, and other. The evaluations imagine a conversation between a person and a language model assistant. The goal with these evaluations is that on careful reflection, the vast majority of people would agree that the chosen response is better (more helpful, honest, and harmless) than the alternative offered for comparison. The task is formatted in terms of binary choices, though many of these have been broken down from a ranked ordering of three or four possible responses.", "evaluation_metadata": {}}, "zeusfsx/ukrainian-news": {"name": "zeusfsx/ukrainian-news", "description": "Ukrainian News Dataset\n\nThis is a dataset of news articles downloaded from various Ukrainian websites and Telegram channels. The dataset contains approximately ~23M JSON objects (news)", "evaluation_metadata": {}}, "krr-oxford/OntoLAMA": {"name": "krr-oxford/OntoLAMA", "description": "OntoLAMA: LAnguage Model Analysis datasets for Ontology Subsumption Inference.", "evaluation_metadata": {}}, "vocabtrimmer/mc4_validation": {"name": "vocabtrimmer/mc4_validation", "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.", "evaluation_metadata": {}}, "oscar-corpus/OSCAR-2301": {"name": "oscar-corpus/OSCAR-2301", "description": "The Open Super-large Crawled Aggregated coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the Ungoliant architecture.\\", "evaluation_metadata": {}}, "yizhongw/self_instruct": {"name": "yizhongw/self_instruct", "description": "Self-Instruct is a dataset that contains 52k instructions, paired with 82K instance inputs and outputs. This instruction data can be used to conduct instruction-tuning for language models and make the language model follow instruction better.", "evaluation_metadata": {}}, "HuggingFaceH4/helpful_instructions": {"name": "HuggingFaceH4/helpful_instructions", "description": "Helpful Instructions is a dataset of (prompt, completion) pairs that are derived from a variety of public datasets. As the name suggests, it focuses on instructions that are \"helpful\", i.e. the kind of questions or tasks a human user might instruct an AI assistant to perform.", "evaluation_metadata": {}}, "ELiRF/dacsa": {"name": "ELiRF/dacsa", "description": "The Dataset for Automatic summarization of Catalan and Spanish newspaper\nArticles (DACSA) corpus. It is a high-quality large-scale corpus that can be\nused to train summarization models for Catalan and Spanish. The data provides\npairs of news article and its summary from different newspapers for both, the\nCatalan and the Spanish languages. Regarding the Catalan set, there are 725,184\nsample pairs from 9 newspapers, regarding the Spanish set, the corpus provides\n2,120,649 sample pairs from 21 newspapers.", "evaluation_metadata": {}}, "KBLab/rixvox": {"name": "KBLab/rixvox", "description": "RixVox is a speech dataset comprised of speeches from the Swedish Parliament (the Riksdag). Audio from speeches have been aligned with official transcripts, on the sentence level, using aeneas. \nSpeaker metadata is available for each observation, including the speaker's name, gender, party, birth year and electoral district. The dataset contains a total of 5493 hours of speech. \nAn observation may consist of one or several sentences (up to 30 seconds in duration).", "evaluation_metadata": {}}, "martinjosifoski/SynthIE": {"name": "martinjosifoski/SynthIE", "description": "The paper ``Exploiting Asymmetry for Synthetic Training Data Generation: SynthIE and the Case of Information Extraction'' builds on the idea that even for hard tasks of interest (with input X and Y) -- for which human-annotation is not practical and high-quality annotated data is not available -- by reversing the task (from Y to X), useful data can be synthetically generated even when that original task cannot be solved directly by the LLM. This process enables the creation of a high-quality dataset of X-Y pairs that will enable the training/fine-tuning of models for the original task of interest.\nIn particular, the paper studies the idea in the context of closed information extraction (IE), where a model is tasked with extracting the exhaustive set of facts expressed in natural language text. The synthetic data generation pipeline proposed in the paper comprises three primary components: (i) construction of a knowledge graph containing the entities and relations of interest; (ii) sampling of coherent triplet sets from the KG with comprehensive coverage of the entities and relations, and (iii) generation of high-quality text, expressing the triplets without any supplementary information.", "evaluation_metadata": {}}, "HuggingFaceH4/instruct_me": {"name": "HuggingFaceH4/instruct_me", "description": "Instruct Me is a dataset of instruction-like dialogues between a human user and AI assistant. The prompts are derived from (prompt, completion) pairs in the Helpful Instructions dataset. The goal is to train a language model to that is \"chatty\" and can answer the kind of questions or tasks a human user might instruct an AI assistant to perform.", "evaluation_metadata": {}}, "timworks/massive-dataset": {"name": "timworks/massive-dataset", "description": "MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\nfor the Natural Language Understanding tasks of intent prediction and slot annotation.\nUtterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\nthe SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.", "evaluation_metadata": {}}, "victorcosta/pt_legislation": {"name": "victorcosta/pt_legislation", "description": "Dataset for researching the application of data-centric NLP techniques on Portuguese legislation.", "evaluation_metadata": {}}, "NawinCom/Eye_diabetic": {"name": "NawinCom/Eye_diabetic", "description": "Beans is a dataset of images of beans taken in the field using smartphone\ncameras. It consists of 3 classes: 2 disease classes and the healthy class.\nDiseases depicted include Angular Leaf Spot and Bean Rust. Data was annotated\nby experts from the National Crops Resources Research Institute (NaCRRI) in\nUganda and collected by the Makerere AI research lab.", "evaluation_metadata": {}}, "kanishka/comps": {"name": "kanishka/comps", "description": "COMPS is a dataset of minimal pair sentences in English that enables the \ntesting knowledge of concepts and their properties in language models (LMs).\nSpecifically, it tests the ability of LMs to attribute properties to everyday \nconcepts, and demonstrate reasoning compatible with property inheritance, where\nsubordinate concepts inherit the properties of their superordinate (hypernyms).", "evaluation_metadata": {}}, "hendrycks/ethics": {"name": "hendrycks/ethics", "description": "A benchmark that spans concepts in justice, well-being, duties, virtues, and commonsense morality.", "evaluation_metadata": {}}, "bbaaaa/iwslt14-de-en": {"name": "bbaaaa/iwslt14-de-en", "description": "The IWSLT 2017 Multilingual Task addresses text translation, including zero-shot translation, with a single MT system across all directions including English, German, Dutch, Italian and Romanian. As unofficial task, conventional bilingual text translation is offered between English and Arabic, French, Japanese, Chinese, German and Korean.", "evaluation_metadata": {}}, "larrylawl/multilexnorm": {"name": "larrylawl/multilexnorm", "description": "For this task, participants are asked to develop a system that performs lexical normalization: the conversion of non-canonical texts to their canonical equivalent form. In particular, this task includes data from 12 languages.", "evaluation_metadata": {}}, "andstor/the_pile_github": {"name": "andstor/the_pile_github", "description": "The Pile is a 825 GiB diverse, open source language modelling data set that consists of 22 smaller, high-quality\ndatasets combined together.", "evaluation_metadata": {}}, "djstrong/oscar-small": {"name": "djstrong/oscar-small", "description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.\\", "evaluation_metadata": {}}, "Joanne/Unified_Benchmark_for_Metaphor_Identification": {"name": "Joanne/Unified_Benchmark_for_Metaphor_Identification", "description": "[Unified Benchmark for Metaphor Identification]", "evaluation_metadata": {}}, "EleutherAI/arithmetic": {"name": "EleutherAI/arithmetic", "description": "A small battery of 10 tests that involve asking language models a simple arithmetic\nproblem in natural language.", "evaluation_metadata": {}}, "rcds/occlusion_swiss_judgment_prediction": {"name": "rcds/occlusion_swiss_judgment_prediction", "description": "This dataset contains an implementation of occlusion for the SwissJudgmentPrediction task.", "evaluation_metadata": {}}, "polinaeterna/duorc": {"name": "polinaeterna/duorc", "description": "DuoRC contains 186,089 unique question-answer pairs created from a collection of 7680 pairs of movie plots where each pair in the collection reflects two versions of the same movie.", "evaluation_metadata": {}}, "gjuggler/bird-data": {"name": "gjuggler/bird-data", "description": "We worked with citizen scientists and domainexperts to collect NABirds, a new high\nquality dataset containing 48,562 images of North American birds with 555 \ncategories, part annotations and bounding boxes.", "evaluation_metadata": {}}, "webnlg/challenge-2023": {"name": "webnlg/challenge-2023", "description": "The WebNLG challenge consists in mapping data to text. The training data consists\nof Data/Text pairs where the data is a set of triples extracted from DBpedia and the text is a verbalisation\nof these triples. For instance, given the 3 DBpedia triples shown in (a), the aim is to generate a text such as (b).\n\na. (John_E_Blaha birthDate 1942_08_26) (John_E_Blaha birthPlace San_Antonio) (John_E_Blaha occupation Fighter_pilot)\nb. John E Blaha, born in San Antonio on 1942-08-26, worked as a fighter pilot\n\nAs the example illustrates, the task involves specific NLG subtasks such as sentence segmentation\n(how to chunk the input data into sentences), lexicalisation (of the DBpedia properties),\naggregation (how to avoid repetitions) and surface realisation\n(how to build a syntactically correct and natural sounding text).", "evaluation_metadata": {}}, "intfloat/query2doc_msmarco": {"name": "intfloat/query2doc_msmarco", "description": "This dataset contains GPT-3.5 (text-davinci-003) generations from MS-MARCO queries.", "evaluation_metadata": {}}, "EleutherAI/wikitext_document_level": {"name": "EleutherAI/wikitext_document_level", "description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.", "evaluation_metadata": {}}, "rcds/lower_court_insertion_swiss_judgment_prediction": {"name": "rcds/lower_court_insertion_swiss_judgment_prediction", "description": "This dataset contains an implementation of lower court insertion for the SwissJudgmentPrediction task.", "evaluation_metadata": {}}, "RGBD-SOD/test": {"name": "RGBD-SOD/test", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "Joanne/Metaphors_and_Analogies": {"name": "Joanne/Metaphors_and_Analogies", "description": "[Unified Benchmark for Metaphor Identification]", "evaluation_metadata": {}}, "thewall/jolma": {"name": "thewall/jolma", "description": "PRJEB3289\nhttps://www.ebi.ac.uk/ena/browser/view/PRJEB3289\nData that has been generated by HT-SELEX experiments (see Jolma et al. 2010. PMID: 20378718 for description of method) that has been now used to generate transcription factor binding specificity models for most of the high confidence human transcription factors. Sequence data is composed of reads generated with Illumina Genome Analyzer IIX and HiSeq2000 instruments. Samples are composed of single read sequencing of synthetic DNA fragments with a fixed length randomized region or samples derived from such a initial library by selection with a sequence specific DNA binding protein. Originally multiple samples with different \"barcode\" tag sequences were run on the same Illumina sequencing lane but the released files have been already de-multiplexed, and the constant regions and \"barcodes\" of each sequence have been cut out of the sequencing reads to facilitate the use of data. Some of the files are composed of reads from multiple different sequencing lanes and due to this each of the names of the individual reads have been edited to show the flowcell and lane that was used to generate it. Barcodes and oligonucleotide designs are indicated in the names of individual entries. Depending of the selection ligand design, the sequences in each of these fastq-files are either 14, 20, 30 or 40 bases long and had different flanking regions in both sides of the sequence. Each run entry is named in either of the following ways: Example 1) \"BCL6B_DBD_AC_TGCGGG20NGA_1\", where name is composed of following fields ProteinName_CloneType_Batch_BarcodeDesign_SelectionCycle. This experiment used barcode ligand TGCGGG20NGA, where both of the variable flanking constant regions are indicated as they were on the original sequence-reads. This ligand has been selected for one round of HT-SELEX using recombinant protein that contained the DNA binding domain of human transcription factor BCL6B. It also tells that the experiment was performed on batch of experiments named as \"AC\". Example 2) 0_TGCGGG20NGA_0 where name is composed of (zero)_BarcodeDesign_(zero) These sequences have been generated from sequencing of the initial non-selected pool. Same initial pools have been used in multiple experiments that were on different batches, thus for example this background sequence pool is the shared background for all of the following samples. BCL6B_DBD_AC_TGCGGG20NGA_1, ZNF784_full_AE_TGCGGG20NGA_3, DLX6_DBD_Y_TGCGGG20NGA_4 and MSX2_DBD_W_TGCGGG20NGA_2", "evaluation_metadata": {}}, "thewall/jolma_unique": {"name": "thewall/jolma_unique", "description": "PRJEB3289\nhttps://www.ebi.ac.uk/ena/browser/view/PRJEB3289\nData that has been generated by HT-SELEX experiments (see Jolma et al. 2010. PMID: 20378718 for description of method) that has been now used to generate transcription factor binding specificity models for most of the high confidence human transcription factors. Sequence data is composed of reads generated with Illumina Genome Analyzer IIX and HiSeq2000 instruments. Samples are composed of single read sequencing of synthetic DNA fragments with a fixed length randomized region or samples derived from such a initial library by selection with a sequence specific DNA binding protein. Originally multiple samples with different \"barcode\" tag sequences were run on the same Illumina sequencing lane but the released files have been already de-multiplexed, and the constant regions and \"barcodes\" of each sequence have been cut out of the sequencing reads to facilitate the use of data. Some of the files are composed of reads from multiple different sequencing lanes and due to this each of the names of the individual reads have been edited to show the flowcell and lane that was used to generate it. Barcodes and oligonucleotide designs are indicated in the names of individual entries. Depending of the selection ligand design, the sequences in each of these fastq-files are either 14, 20, 30 or 40 bases long and had different flanking regions in both sides of the sequence. Each run entry is named in either of the following ways: Example 1) \"BCL6B_DBD_AC_TGCGGG20NGA_1\", where name is composed of following fields ProteinName_CloneType_Batch_BarcodeDesign_SelectionCycle. This experiment used barcode ligand TGCGGG20NGA, where both of the variable flanking constant regions are indicated as they were on the original sequence-reads. This ligand has been selected for one round of HT-SELEX using recombinant protein that contained the DNA binding domain of human transcription factor BCL6B. It also tells that the experiment was performed on batch of experiments named as \"AC\". Example 2) 0_TGCGGG20NGA_0 where name is composed of (zero)_BarcodeDesign_(zero) These sequences have been generated from sequencing of the initial non-selected pool. Same initial pools have been used in multiple experiments that were on different batches, thus for example this background sequence pool is the shared background for all of the following samples. BCL6B_DBD_AC_TGCGGG20NGA_1, ZNF784_full_AE_TGCGGG20NGA_3, DLX6_DBD_Y_TGCGGG20NGA_4 and MSX2_DBD_W_TGCGGG20NGA_2", "evaluation_metadata": {}}, "thewall/jolma_subset": {"name": "thewall/jolma_subset", "description": "PRJEB3289\nhttps://www.ebi.ac.uk/ena/browser/view/PRJEB3289\nData that has been generated by HT-SELEX experiments (see Jolma et al. 2010. PMID: 20378718 for description of method) that has been now used to generate transcription factor binding specificity models for most of the high confidence human transcription factors. Sequence data is composed of reads generated with Illumina Genome Analyzer IIX and HiSeq2000 instruments. Samples are composed of single read sequencing of synthetic DNA fragments with a fixed length randomized region or samples derived from such a initial library by selection with a sequence specific DNA binding protein. Originally multiple samples with different \"barcode\" tag sequences were run on the same Illumina sequencing lane but the released files have been already de-multiplexed, and the constant regions and \"barcodes\" of each sequence have been cut out of the sequencing reads to facilitate the use of data. Some of the files are composed of reads from multiple different sequencing lanes and due to this each of the names of the individual reads have been edited to show the flowcell and lane that was used to generate it. Barcodes and oligonucleotide designs are indicated in the names of individual entries. Depending of the selection ligand design, the sequences in each of these fastq-files are either 14, 20, 30 or 40 bases long and had different flanking regions in both sides of the sequence. Each run entry is named in either of the following ways: Example 1) \"BCL6B_DBD_AC_TGCGGG20NGA_1\", where name is composed of following fields ProteinName_CloneType_Batch_BarcodeDesign_SelectionCycle. This experiment used barcode ligand TGCGGG20NGA, where both of the variable flanking constant regions are indicated as they were on the original sequence-reads. This ligand has been selected for one round of HT-SELEX using recombinant protein that contained the DNA binding domain of human transcription factor BCL6B. It also tells that the experiment was performed on batch of experiments named as \"AC\". Example 2) 0_TGCGGG20NGA_0 where name is composed of (zero)_BarcodeDesign_(zero) These sequences have been generated from sequencing of the initial non-selected pool. Same initial pools have been used in multiple experiments that were on different batches, thus for example this background sequence pool is the shared background for all of the following samples. BCL6B_DBD_AC_TGCGGG20NGA_1, ZNF784_full_AE_TGCGGG20NGA_3, DLX6_DBD_Y_TGCGGG20NGA_4 and MSX2_DBD_W_TGCGGG20NGA_2", "evaluation_metadata": {}}, "nanaaaa/BilingualChildrenEmo": {"name": "nanaaaa/BilingualChildrenEmo", "description": "The BilingualChildrenEmo dataset is a multilingual emotion dataset annotated by language experts under a project. The dataset can be used for tasks such as multilingual (Chinese and English) emotion classification and identification.", "evaluation_metadata": {}}, "oscar-corpus/oscar-2301-hpc": {"name": "oscar-corpus/oscar-2301-hpc", "description": "The Open Super-large Crawled Aggregated coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the Ungoliant architecture.\\", "evaluation_metadata": {}}, "CATIE-AQ/frenchQA": {"name": "CATIE-AQ/frenchQA", "description": "One French QA Dataset to rule them all, One French QA Dataset to find them, One French QA Dataset to bring them all, and in the darkness bind them.", "evaluation_metadata": {}}, "mxeval/mbxp": {"name": "mxeval/mbxp", "description": "A collection of execution-based multi-lingual benchmark for code generation.", "evaluation_metadata": {}}, "mxeval/multi-humaneval": {"name": "mxeval/multi-humaneval", "description": "A collection of execution-based multi-lingual benchmark for code generation.", "evaluation_metadata": {}}, "mxeval/mathqa-x": {"name": "mxeval/mathqa-x", "description": "A collection of execution-based multi-lingual benchmark for code generation.", "evaluation_metadata": {}}, "mxeval/mxeval": {"name": "mxeval/mxeval", "description": "A collection of execution-based multi-lingual benchmark for code generation.", "evaluation_metadata": {}}, "cartesinus/leyzer-fedcsis": {"name": "cartesinus/leyzer-fedcsis", "description": " Leyzer is a multilingual text corpus designed to study multilingual and cross-lingual natural language\n understanding (NLU) models and the strategies of localization of virtual assistants. It consists of 20\n domains across three languages: English, Spanish and Polish, with 186 intents and a wide range of\n samples, ranging from 1 to 672 sentences per intent.", "evaluation_metadata": {}}, "mweiss/mnist_ambiguous": {"name": "mweiss/mnist_ambiguous", "description": "The images were created such that they have an unclear ground truth, \ni.e., such that they are similar to multiple - but not all - of the datasets classes.\nRobust and uncertainty-aware models should be able to detect and flag these ambiguous images.\nAs such, the dataset should be merged / mixed with the original dataset and we\nprovide such 'mixed' splits for convenience. Please refer to the dataset card for details.", "evaluation_metadata": {}}, "mweiss/fashion_mnist_ambiguous": {"name": "mweiss/fashion_mnist_ambiguous", "description": "The images were created such that they have an unclear ground truth, \ni.e., such that they are similar to multiple - but not all - of the datasets classes.\nRobust and uncertainty-aware models should be able to detect and flag these ambiguous images.\nAs such, the dataset should be merged / mixed with the original dataset and we\nprovide such 'mixed' splits for convenience. Please refer to the dataset card for details.", "evaluation_metadata": {}}, "orkg/SciQA": {"name": "orkg/SciQA", "description": " SciQA contains 2,565 SPARQL query - question pairs along with answers fetched from the open research knowledge graph (ORKG) via a Virtuoso SPARQL endpoint, it is a collection of both handcrafted and autogenerated questions and queries. The dataset is split into 70% training, 10% validation and 20% test examples. The dataset is available as JSON files.", "evaluation_metadata": {}}, "shunk031/CAMERA": {"name": "shunk031/CAMERA", "description": "CAMERA (CyberAgent Multimodal Evaluation for Ad Text GeneRAtion) is the Japanese ad text generation dataset.", "evaluation_metadata": {}}, "khalidalt/model-written-evals": {"name": "khalidalt/model-written-evals", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "KETI-AIR/coco": {"name": "KETI-AIR/coco", "description": "COCO is a large-scale object detection, segmentation, and\ncaptioning dataset.\nNote:\n * Some images from the train and validation sets don't have annotations.\n * Coco 2014 and 2017 uses the same images, but different train/val/test splits\n * The test split don't have any annotations (only images).\n * Coco defines 91 classes but the data only uses 80 classes.\n * Panotptic annotations defines defines 200 classes but only uses 133.", "evaluation_metadata": {}}, "theblackcat102/codex-math-qa": {"name": "theblackcat102/codex-math-qa", "description": "Solution by codex-davinci-002 for math_qa", "evaluation_metadata": {}}, "intfloat/multilingual_cc_news": {"name": "intfloat/multilingual_cc_news", "description": "\\\r\nMultilingual CC-News dataset.\r\n\r\nThis is the processed version from https://huggingface.co/datasets/CloverSearch/cc-news-mutlilingual.", "evaluation_metadata": {}}, "semeru/completeformer_java_data": {"name": "semeru/completeformer_java_data", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "HiTZ/alpaca_mt": {"name": "HiTZ/alpaca_mt", "description": "Alpaca is a dataset of 52,000 instructions and demonstrations generated by OpenAI's text-davinci-003 engine. This instruction data can be used to conduct instruction-tuning for language models and make the language model follow instruction better. This dataset also includes machine-translated data for 6 Iberian languages: Portuguese, Spanish, Catalan, Basque, Galician and Asturian.", "evaluation_metadata": {}}, "qanastek/MORFITT": {"name": "qanastek/MORFITT", "description": "This article presents MORFITT, the first multi-label corpus in French annotated in\nspecialties in the medical field. MORFITT is composed of 3~624 abstracts of scientific\narticles from PubMed, annotated in 12 specialties for a total of 5,116 annotations. \nWe detail the corpus, the experiments and the preliminary results obtained using a \nclassifier based on the pre-trained language model CamemBERT. These preliminary results\ndemonstrate the difficulty of the task, with a weighted average F1-score of 61.78%.", "evaluation_metadata": {}}, "gabeorlanski/tp3": {"name": "gabeorlanski/tp3", "description": "Translating Python Programming Puzzles (TP3) is a code translation benchmark created from the verification functions from the questions in the original Python Programming Puzzles dataset (Schuster et al., 2021) to create this dataset. These functions are hand-crafted by the authors and are used to check if an answer satisfies the constraints of the puzzle. These puzzles range in difficulty from basic character checking to competitive programming problems. Thus, each verification function is written by an expert python programmer and requires a significant understanding of programming to translate. In total, there are 370 python functions to translate.", "evaluation_metadata": {}}, "slhenty/climate-fever-nli-stsb": {"name": "slhenty/climate-fever-nli-stsb", "description": "A modified CLIMATE-FEVER dataset that includes NLI-style features and STSb-features suitable for SentenceBERT training scripts.", "evaluation_metadata": {}}, "ujs/hinglish": {"name": "ujs/hinglish", "description": "A Hugginface version of the Hindi-English code-switched dataset from OpenSLR-104.", "evaluation_metadata": {}}, "rcds/swiss_judgment_prediction_xl": {"name": "rcds/swiss_judgment_prediction_xl", "description": "This dataset contains court decision for judgment prediction task.", "evaluation_metadata": {}}, "jarvisx17/jxner": {"name": "jarvisx17/jxner", "description": "WNUT 17: Emerging and Rare entity recognition\nThis shared task focuses on identifying unusual, previously-unseen entities in the context of emerging discussions.\nNamed entities form the basis of many modern approaches to other tasks (like event clustering and summarisation),\nbut recall on them is a real problem in noisy text - even among annotators. This drop tends to be due to novel entities and surface forms.\nTake for example the tweet \u201cso.. kktny in 30 mins?\u201d - even human experts find entity kktny hard to detect and resolve.\nThis task will evaluate the ability to detect and classify novel, emerging, singleton named entities in noisy text.\nThe goal of this task is to provide a definition of emerging and of rare entities, and based on that, also datasets for detecting these entities.", "evaluation_metadata": {}}, "hieuhocnlp/deep-research": {"name": "hieuhocnlp/deep-research", "description": "This dataset is for research at DeepUSC.", "evaluation_metadata": {}}, "proofcheck/prooflang": {"name": "proofcheck/prooflang", "description": "\\ The ProofLang Corpus includes over three million\nEnglish-language proofs\u2014558 million words\u2014mechanically extracted from the papers\n(Math, CS, Physics, etc.) posted on arXiv.org between 1992 and 2020. The focus\nof this corpus is written proofs, not the explanatory text that surrounds them,\nand more specifically on the language used in such proofs; mathematical\ncontent is filtered out, resulting in sentences such as ``Let MATH be\nthe restriction of MATH to MATH.'' This dataset reflects how people prefer to\nwrite informal proofs. It is also amenable to statistical analyses and to\nexperiments with Natural Language Processing (NLP) techniques.", "evaluation_metadata": {}}, "rcds/swiss_law_area_prediction": {"name": "rcds/swiss_law_area_prediction", "description": "This dataset contains court decision for law area prediction task.", "evaluation_metadata": {}}, "AlexFierro9/imagenet-1k_test": {"name": "AlexFierro9/imagenet-1k_test", "description": "ILSVRC 2012, commonly known as 'ImageNet' is an image dataset organized according to the WordNet hierarchy. Each meaningful concept in WordNet, possibly described by multiple words or word phrases, is called a \"synonym set\" or \"synset\". There are more than 100,000 synsets in WordNet, majority of them are nouns (80,000+). ImageNet aims to provide on average 1000 images to illustrate each synset. Images of each concept are quality-controlled and human-annotated. In its completion, ImageNet hopes to offer tens of millions of cleanly sorted images for most of the concepts in the WordNet hierarchy. ImageNet 2012 is the most commonly used subset of ImageNet. This dataset spans 1000 object classes and contains 1,281,167 training images, 50,000 validation images and 100,000 test images", "evaluation_metadata": {}}, "bbaaaa/iwslt14-de-en-preprocess": {"name": "bbaaaa/iwslt14-de-en-preprocess", "description": "The IWSLT 2017 Multilingual Task addresses text translation, including zero-shot translation, with a single MT system across all directions including English, German, Dutch, Italian and Romanian. As unofficial task, conventional bilingual text translation is offered between English and Arabic, French, Japanese, Chinese, German and Korean.", "evaluation_metadata": {}}, "nurik040404/mse": {"name": "nurik040404/mse", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "EMBO/SourceData": {"name": "EMBO/SourceData", "description": " This dataset is based on the SourceData database and is intented to facilitate training of NLP tasks in the cell and molecualr biology domain.", "evaluation_metadata": {}}, "koutch/staqc": {"name": "koutch/staqc", "description": "StaQC (Stack Overflow Question-Code pairs) is a dataset of around 148K Python and 120K SQL domain question-code pairs, \nwhich are automatically mined from Stack Overflow using a Bi-View Hierarchical Neural Network, \nas described in the paper \"StaQC: A Systematically Mined Question-Code Dataset from Stack Overflow\" (WWW'18).", "evaluation_metadata": {}}, "RussianNLP/rucola": {"name": "RussianNLP/rucola", "description": "Russian Corpus of Linguistic Acceptability (RuCoLA) is a novel benchmark of 13.4k sentences labeled as acceptable or not. RuCoLA combines in-domain sentences manually collected from linguistic literature and out-of-domain sentences produced by nine machine translation and paraphrase generation models. The motivation behind the out-of-domain set is to facilitate the practical use of acceptability judgments for improving language generation. Each unacceptable sentence is additionally labeled with four standard and machine-specific coarse-grained categories: morphology, syntax, semantics, and hallucinations.", "evaluation_metadata": {}}, "vldsavelyev/guitar_tab": {"name": "vldsavelyev/guitar_tab", "description": "Dataset of music tablature, in alphaTex (https://alphatab.net/docs/alphatex) \nformat, converted from Guitar Pro files (gp3, gp4, gp5, which are downloaded \nfrom https://rutracker.org/forum/viewtopic.php?t=2888130", "evaluation_metadata": {}}, "cartesinus/leyzer-fedcsis-translated": {"name": "cartesinus/leyzer-fedcsis-translated", "description": " Leyzer is a multilingual text corpus designed to study multilingual and cross-lingual natural language\n understanding (NLU) models and the strategies of localization of virtual assistants. It consists of 20\n domains across three languages: English, Spanish and Polish, with 186 intents and a wide range of\n samples, ranging from 1 to 672 sentences per intent.", "evaluation_metadata": {}}, "casehold/casehold": {"name": "casehold/casehold", "description": "CaseHOLD (Case Holdings On Legal Decisions) is a law dataset comprised of over 53,000+ multiple choice questions to identify the relevant holding of a cited case.", "evaluation_metadata": {}}, "enoriega/odinsynth_sequence_dataset": {"name": "enoriega/odinsynth_sequence_dataset", "description": "Dataset for Odinsynth sequence data generation", "evaluation_metadata": {}}, "koutch/JuICe": {"name": "koutch/JuICe", "description": "JuICe, a corpus of 1.5 million examples with a curated test set of 3.7K instances based on online programming assignments.", "evaluation_metadata": {}}, "swype/instruct": {"name": "swype/instruct", "description": "A dataset containing prompt and completion pairs for various tasks.", "evaluation_metadata": {}}, "jarvisx17/label_data": {"name": "jarvisx17/label_data", "description": "WNUT 17: Emerging and Rare entity recognition\nThis shared task focuses on identifying unusual, previously-unseen entities in the context of emerging discussions.\nNamed entities form the basis of many modern approaches to other tasks (like event clustering and summarisation),\nbut recall on them is a real problem in the noisy text - even among annotators. This drop tends to be due to novel entities and surface forms.\nTake for example the tweet \u201cso.. kktny in 30 mins?\u201d - even human experts find entity kktny hard to detect and resolve.\nThis task will evaluate the ability to detect and classify novel, emerging, singleton-named entities in noisy text.\nThe goal of this task is to provide a definition of emerging and rare entities, and based on that, also datasets for detecting these entities.", "evaluation_metadata": {}}, "distil-whisper/librispeech_asr": {"name": "distil-whisper/librispeech_asr", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "TurkuNLP/Suomi24-toxicity-annotated": {"name": "TurkuNLP/Suomi24-toxicity-annotated", "description": "This dataset consists of Suomi24 comments which have been labeled by human raters for toxic behavior.", "evaluation_metadata": {}}, "LIDIA-HESSEN/TexPrax": {"name": "LIDIA-HESSEN/TexPrax", "description": "This dataset was collected in the [TexPrax](https://texprax.de/) project and contains named entities annotated by three researchers as well as annotated sentences (problem/P, cause/C, solution/S, and other/O).", "evaluation_metadata": {}}, "rcds/swiss_criticality_prediction": {"name": "rcds/swiss_criticality_prediction", "description": "This dataset contains Swiss federal court decisions for the legal criticality prediction task", "evaluation_metadata": {}}, "calistacxy/imda-dataset": {"name": "calistacxy/imda-dataset", "description": "The National Speech Corpus (NSC) is the first large-scale Singapore English corpus\nspearheaded by the Info-communications and Media Development Authority (IMDA) of Singapore.", "evaluation_metadata": {}}, "Yulong-W/squadori": {"name": "Yulong-W/squadori", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "Yulong-W/squadpara": {"name": "Yulong-W/squadpara", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "rjjan/reuters21578": {"name": "rjjan/reuters21578", "description": "The Reuters-21578 dataset is one of the most widely used data collections for text\ncategorization research. It is collected from the Reuters financial newswire service in 1987.", "evaluation_metadata": {}}, "nanakonoda/xnli_parallel": {"name": "nanakonoda/xnli_parallel", "description": "This dataset was taken from XNLI for a binary text classification task. It has been parallelized in English, German, and French.", "evaluation_metadata": {}}, "Jane016/whisper2": {"name": "Jane016/whisper2", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "IES-Rafael-Alberti/letras-carnaval-cadiz": {"name": "IES-Rafael-Alberti/letras-carnaval-cadiz", "description": "This dataset is a comprehensive collection of lyrics from the Carnaval de C\u00e1diz, a significant cultural heritage of the city of C\u00e1diz, Spain. Despite its cultural importance, there has been a lack of a structured database for these lyrics, hindering research and public access to this cultural heritage. This dataset aims to address this gap.\n\nThe dataset was created by the C\u00e1diz AI Learning Community, a branch of the non-profit association Spain AI, and was developed by Iv\u00e1n Romero Reyna and Jes\u00fas Federico Franco Medinilla, students of the Specialization Course in Artificial Intelligence and Big Data at IES Rafael Alberti during the 2022-2023 academic year. The project is supervised by Jes\u00fas Carlos Avecilla de la Herr\u00e1n, a computational linguist.\n\nCollaboration is encouraged, with individuals able to verify the different records of the dataset at letrascarnavalcadiz.com, ensuring the transcription of the lyrics and all data are correct. New lyrics can also be added to the dataset. Corrections and additions are not immediately reflected in the dataset but are updated periodically.\n\nFor more information or to report a problem, you can write to contacto@letrascarnavalcadiz.com.", "evaluation_metadata": {}}, "davebulaval/RISCBAC": {"name": "davebulaval/RISCBAC", "description": "RISCBAC was created using [RISC](https://github.com/GRAAL-Research/risc), an open-source Python package data \ngenerator. RISC generates look-alike automobile insurance contracts based on the Quebec regulatory insurance \nform in French and English.\n\nIt contains 10,000 English and French insurance contracts generated using the same seed. Thus, contracts share \nthe same deterministic synthetic data (RISCBAC can be used as an aligned dataset). RISC can be used to generate \nmore data for RISCBAC.", "evaluation_metadata": {}}, "koutch/intro_prog": {"name": "koutch/intro_prog", "description": "The Dublin programming dataset is a dataset composed of students' submissions \nto introductory programming assignments at the University of Dublin. \nStudents submitted these programs for multiple programming courses over the duration of three academic years.", "evaluation_metadata": {}}, "harpomaxx/dga-detection": {"name": "harpomaxx/dga-detection", "description": "A dataset containing both DGA and normal domain names. The normal domain names were taken from the Alexa top one million domains. An additional 3,161 normal \ndomains were included in the dataset, provided by the Bambenek Consulting feed. This later group is particularly interesting since it consists of suspicious domain \nnames that were not generated by DGA. Therefore, the total amount of domains normal in the dataset is 1,003,161. DGA domains were obtained from the repositories \nof DGA domains of Andrey Abakumov and John Bambenek. The total amount of DGA domains is 1,915,335, and they correspond to 51 different malware families. DGA domains \nwere generated by 51 different malware families. About the 55% of of the DGA portion of dataset is composed of samples from the Banjori, Post, Timba, Cryptolocker, \nRamdo and Conficker malware.", "evaluation_metadata": {}}, "WxWx/ChatGPT-Detector-Bias": {"name": "WxWx/ChatGPT-Detector-Bias", "description": "The data folders contain the human-written and AI-generated datasets used in our study. Each subfolder contains a name.json file, which provides the metadata, and a data.json file, which contains the text samples.", "evaluation_metadata": {}}, "learningmachineaz/translate_enaz_10m": {"name": "learningmachineaz/translate_enaz_10m", "description": "Machine translation EN-AZ dataset based on Google Translate and National Library of Azerbaijan.", "evaluation_metadata": {}}, "mlengineer-ai/jomleh": {"name": "mlengineer-ai/jomleh", "description": "Jomleh is a Farsi (Persian) monolingual dataset composed of one sentence per sample. It's focused on quality over quantity and it's curated mostly based on the OSCAR project (https://oscar-project.com) among other data sources.\\", "evaluation_metadata": {}}, "Tylersuard/PathfinderX2": {"name": "Tylersuard/PathfinderX2", "description": "The rapid progress of large language models has led to impressive results in a wide array of tasks. However, there remains a need for increasingly challenging datasets to evaluate these models' ability to handle long-range dependencies. In this paper, we present Pathfinder-X2, a novel dataset that builds upon the Pathfinder and Pathfinder-X datasets. Pathfinder-X2 comprises 512x512 pixel images, designed to test large language models' capacity to segment a specific white line dash \"snake\" with a circle at its tip among a collection of similar, distractor snakes. The increased image resolution and complexity of Pathfinder-X2 present a substantially more challenging task for large language models, contributing to the ongoing development and assessment of such models.", "evaluation_metadata": {}}, "hpprc/jsick": {"name": "hpprc/jsick", "description": "Japanese Sentences Involving Compositional Knowledge (JSICK) Dataset.\nJSICK is the Japanese NLI and STS dataset by manually translating the English dataset SICK (Marelli et al., 2014) into Japanese.\nWe hope that our dataset will be useful in research for realizing more advanced models that are capable of appropriately performing multilingual compositional inference.\n(from official website)", "evaluation_metadata": {}}, "NTU-NLP-sg/xCodeEval": {"name": "NTU-NLP-sg/xCodeEval", "description": "The ability to solve problems is a hallmark of intelligence and has been an enduring goal in AI. AI systems that can create programs as solutions to problems or assist developers in writing programs can increase productivity and make programming more accessible. Recently, pre-trained large language models have shown impressive abilities in generating new codes from natural language descriptions, repairing buggy codes, translating codes between languages, and retrieving relevant code segments. However, the evaluation of these models has often been performed in a scattered way on only one or two specific tasks, in a few languages, at a partial granularity (e.g., function) level and in many cases without proper training data. Even more concerning is that in most cases the evaluation of generated codes has been done in terms of mere lexical overlap rather than actual execution whereas semantic similarity (or equivalence) of two code segments depends only on their ``execution similarity'', i.e., being able to get the same output for a given input.", "evaluation_metadata": {}}, "the-coorporation/the_squad_qg": {"name": "the-coorporation/the_squad_qg", "description": "A preprocessed version of the Standford Question Answering Dataset (SQuAD) version 2.0 consisting of contexts and questions only.\n\nDuplicate contexts have been removed and corresponding questions have been merged into an array per context.\n\nStanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable. \nSQuAD 2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD 2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering.", "evaluation_metadata": {}}, "zaemyung/IteraTeR_plus": {"name": "zaemyung/IteraTeR_plus", "description": "This is the IteraTeR+ dataset used to train the DElIteraTeR system described in the paper: \"Improving Iterative Text Revision by Learning Where to Edit from Other Revision Tasks.\"\nThe dataset is divided into two parts, `single_sent` and `multi_sent`.\nThe former consists of text revision changes for a single sentence of interest, while the latter adds (where applicable) a previous and a following sentences in addition to the sentence of interest.\nIn the `multi_sent` case, the sentence of interest is marked by `` and `` tokens to differentiate from the contextual sentences.\nEach line of a dataset is a JSON line with two keys: `before_sent` and `after_sent`.\nThe value of `before_sent` is text before the revision where the corresponding parts (or spans) to be revised is marked by edit intents.\nThe value of `after_sent` is the revised plain text.", "evaluation_metadata": {}}, "victorcosta/ria_pt__proems_format": {"name": "victorcosta/ria_pt__proems_format", "description": "Dataset for researching the application of data-centric NLP techniques on Portuguese legislation.", "evaluation_metadata": {}}, "xiaojuan0920/CSKG": {"name": "xiaojuan0920/CSKG", "description": "CSKG is a commonsense knowledge graph that combines seven popular sources into a consolidated representation: ATOMIC, ConceptNet, FrameNet, Roget, Visual Genome, Wikidata (We use the Wikidata-CS subset), and WordNet. CSKG is represented as a hyper-relational graph, by using the KGTK data model and file specification. Its creation is entirely supported by KGTK operations.", "evaluation_metadata": {}}, "nanakonoda/xnli_cm": {"name": "nanakonoda/xnli_cm", "description": "This dataset was generated from XNLI using the CodeMixed Text Generator for a binary text classification task.", "evaluation_metadata": {}}, "EdwardLin2023/AESDD": {"name": "EdwardLin2023/AESDD", "description": "AESDD v1.0 was created on October 2017 in the Laboratory of Electronic Media, \nSchool of Journalism and Mass Communications, Aristotle University of Thessaloniki, \nfor the needs of Speech Emotion Recognition research of the Multidisciplinary Media & \nMediated Communication Research Group (M3C, http://m3c.web.auth.gr/).\n\nFor the creation of v.1 of the database, 5 (3 female and 2 male) professional actors were recorded. \n19 utterances of ambiguous out of context emotional content were chosen. \nThe actors acted these 19 utterances in every one of the 5 chosen emotions. \nOne extra improvised utterance was added for every actor and emotion. \nThe guidance of the actors and the choice of the final recordings were supervised by \na scientific expert in dramatology. For some of the utterances, more that one takes were qualified. \nConsequently, around 500 utterances occured in the final database.", "evaluation_metadata": {}}, "thewall/jolma_split": {"name": "thewall/jolma_split", "description": "PRJEB3289\nhttps://www.ebi.ac.uk/ena/browser/view/PRJEB3289\nData that has been generated by HT-SELEX experiments (see Jolma et al. 2010. PMID: 20378718 for description of method) that has been now used to generate transcription factor binding specificity models for most of the high confidence human transcription factors. Sequence data is composed of reads generated with Illumina Genome Analyzer IIX and HiSeq2000 instruments. Samples are composed of single read sequencing of synthetic DNA fragments with a fixed length randomized region or samples derived from such a initial library by selection with a sequence specific DNA binding protein. Originally multiple samples with different \"barcode\" tag sequences were run on the same Illumina sequencing lane but the released files have been already de-multiplexed, and the constant regions and \"barcodes\" of each sequence have been cut out of the sequencing reads to facilitate the use of data. Some of the files are composed of reads from multiple different sequencing lanes and due to this each of the names of the individual reads have been edited to show the flowcell and lane that was used to generate it. Barcodes and oligonucleotide designs are indicated in the names of individual entries. Depending of the selection ligand design, the sequences in each of these fastq-files are either 14, 20, 30 or 40 bases long and had different flanking regions in both sides of the sequence. Each run entry is named in either of the following ways: Example 1) \"BCL6B_DBD_AC_TGCGGG20NGA_1\", where name is composed of following fields ProteinName_CloneType_Batch_BarcodeDesign_SelectionCycle. This experiment used barcode ligand TGCGGG20NGA, where both of the variable flanking constant regions are indicated as they were on the original sequence-reads. This ligand has been selected for one round of HT-SELEX using recombinant protein that contained the DNA binding domain of human transcription factor BCL6B. It also tells that the experiment was performed on batch of experiments named as \"AC\". Example 2) 0_TGCGGG20NGA_0 where name is composed of (zero)_BarcodeDesign_(zero) These sequences have been generated from sequencing of the initial non-selected pool. Same initial pools have been used in multiple experiments that were on different batches, thus for example this background sequence pool is the shared background for all of the following samples. BCL6B_DBD_AC_TGCGGG20NGA_1, ZNF784_full_AE_TGCGGG20NGA_3, DLX6_DBD_Y_TGCGGG20NGA_4 and MSX2_DBD_W_TGCGGG20NGA_2", "evaluation_metadata": {}}, "jquave/e": {"name": "jquave/e", "description": "An open-source replication of E", "evaluation_metadata": {}}, "jquave/e_smol": {"name": "jquave/e_smol", "description": "An open-source replication of E smol", "evaluation_metadata": {}}, "jquave/e_micro": {"name": "jquave/e_micro", "description": "An open-source replication of E micro", "evaluation_metadata": {}}, "nanakonoda/xnli_cm_sample": {"name": "nanakonoda/xnli_cm_sample", "description": "This dataset was generated from XNLI using the CodeMixed Text Generator for a binary text classification task.", "evaluation_metadata": {}}, "NicolaiSivesind/human-vs-machine": {"name": "NicolaiSivesind/human-vs-machine", "description": "This dataset contains labeled data with human-produced and machine-generated texts based on various \ndomains: Wikipedia introductions and academic articles.", "evaluation_metadata": {}}, "may-ohta/iwslt14": {"name": "may-ohta/iwslt14", "description": "The IWSLT 2014 Evaluation Campaign includes the MT track on TED Talks. In this edition, the official language pairs are five:\n\n from English to French\n from English to German\n from German to English\n from English to Italian\n from Italian to English\n\nOptional tasks are proposed with English paired in both directions with other twelve languages:\n\n from/to English to/from Arabic, Spanish, Farsi, Hebrew, Dutch, Polish, Portuguese-Brazil, Romanian, Russian, Slovenian, Turkish and Chinese\n\nSubmitted runs on additional pairs will be evaluated as well, in the hope to stimulate the MT community to evaluate systems on common benchmarks and to share achievements on challenging translation tasks.", "evaluation_metadata": {}}, "may-ohta/kftt": {"name": "may-ohta/kftt", "description": "The Kyoto Free Translation Task is a task for Japanese-English translation that focuses\non Wikipedia articles related to Kyoto. The data used was originally prepared by the\nNational Institute for Information and Communication Technology (NICT) and released as\nthe Japanese-English Bilingual Corpus of Wikipedia's Kyoto Articles (we are simply using\nthe data, NICT does not specifically endorse or sponsor this task).", "evaluation_metadata": {}}, "joelito/MultiLegalPile_Chunks_4000": {"name": "joelito/MultiLegalPile_Chunks_4000", "description": "A chunked version of the MultiLegalPile dataset.", "evaluation_metadata": {}}, "pheepa/jira-commentaries-mlm": {"name": "pheepa/jira-commentaries-mlm", "description": "Dataset of jira comments from different projects of Apache and more.", "evaluation_metadata": {}}, "harish03/catbreed": {"name": "harish03/catbreed", "description": "Demo dataset for testing or showing image-classification capabilities.", "evaluation_metadata": {}}, "BAAI/COIG": {"name": "BAAI/COIG", "description": "We propose the Chinese Open Instruction Generalist (COIG) project to maintain a harmless, helpful, and diverse set of Chinese instruction corpora. We welcome all researchers in the community to contribute to the corpus set and collaborate with us. We only release the first chip of COIG to help the Chinese LLMs' development in the exploration stage and appeal to more researchers joining us in building COIG. We introduce a manually verified translated general instruction corpus, a manually annotated exam instruction corpus, a human value alignment instruction corpus, a multi-round counterfactual correction chat corpus, and a leetcode instruction corpus. We provide these new instruction corpora to assist the community with instruction tuning on Chinese LLMs. These instruction corpora are also template workflows for how new Chinese instruction corpora can be built and expanded effectively.", "evaluation_metadata": {}}, "togethercomputer/RedPajama-Data-1T-Sample": {"name": "togethercomputer/RedPajama-Data-1T-Sample", "description": "RedPajama is a clean-room, fully open-source implementation of the LLaMa dataset. This is a 1B-token sample of the full dataset.", "evaluation_metadata": {}}, "togethercomputer/RedPajama-Data-1T": {"name": "togethercomputer/RedPajama-Data-1T", "description": "RedPajama is a clean-room, fully open-source implementation of the LLaMa dataset.", "evaluation_metadata": {}}, "webis/Touche23-ValueEval": {"name": "webis/Touche23-ValueEval", "description": "Dataset for Touch\\u00E9 / SemEval 2023 Task 4; ValueEval: Identification of Human Values behind Arguments:\nhttps://www.overleaf.com/6679855346wrdckzkdccxg\nBased on the original Webis-ArgValues-22 (https://doi.org/10.5281/zenodo.5657249) dataset accompanying the paper\nIdentifying the Human Values behind Arguments (Kiesel et al. 2022b; https://webis.de/publications.html#kiesel_2022b),\npublished at ACL'22.", "evaluation_metadata": {}}, "jiacheng-ye/logiqa-zh": {"name": "jiacheng-ye/logiqa-zh", "description": "LogiQA is constructed from the logical comprehension problems from publically available questions of the National Civil Servants Examination of China, which is designed to test the civil servant candidates\u2019 critical thinking and problem-solving. This dataset includes the Chinese versions only", "evaluation_metadata": {}}, "jiacheng-ye/nl2bash": {"name": "jiacheng-ye/nl2bash", "description": "The dataset is constructed from\nhttps://github.com/TellinaTool/nl2bash", "evaluation_metadata": {}}, "pheepa/ami-summary": {"name": "pheepa/ami-summary", "description": "Dataset of meeting transcriptions and summaries of AMI courpus.", "evaluation_metadata": {}}, "renumics/cifar10-enriched": {"name": "renumics/cifar10-enriched", "description": "The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images\nper class. There are 50000 training images and 10000 test images.\nThis version if CIFAR-10 is enriched with several metadata such as embeddings, baseline results and label error scores.", "evaluation_metadata": {}}, "HuggingFaceM4/cm4-synthetic-testing-with-embeddings": {"name": "HuggingFaceM4/cm4-synthetic-testing-with-embeddings", "description": "This dataset is designed to be used in testing. It's derived from cm4-10k dataset", "evaluation_metadata": {}}, "lang-uk/every_prompt": {"name": "lang-uk/every_prompt", "description": "Every prompt dataset.\nEvery Prompt is a data-driven approach to mining instructions from the web.\nIt contains over a million FAQs and HowTos from around the world in a structured format.\nIt also has basic pre-processing to calculate the length of the useful text and identify the language of that text with the help of GCLD3", "evaluation_metadata": {}}, "HuggingFaceM4/general-pmd-synthetic-testing-with-embeddings": {"name": "HuggingFaceM4/general-pmd-synthetic-testing-with-embeddings", "description": "This dataset is designed to be used in testing. It's derived from general-pmd-10k dataset", "evaluation_metadata": {}}, "lighteval/MATH": {"name": "lighteval/MATH", "description": "MATH is a dataset of 12,500 challenging competition mathematics problems. Each\nproblem in Math has a full step-by-step solution which can be used to teach\nmodels to generate answer derivations and explanations.", "evaluation_metadata": {}}, "kenjiqq/imagereward-evaluation": {"name": "kenjiqq/imagereward-evaluation", "description": "Dataset to perform Aesthetic evaluation from Table 2 in ImageReward: Learning and Evaluating Human Preferences for Text-to-Image Generation.\n\n@misc{xu2023imagereward,\n title={ImageReward: Learning and Evaluating Human Preferences for Text-to-Image Generation}, \n author={Jiazheng Xu and Xiao Liu and Yuchen Wu and Yuxuan Tong and Qinkai Li and Ming Ding and Jie Tang and Yuxiao Dong},\n year={2023},\n eprint={2304.05977},\n archivePrefix={arXiv},\n primaryClass={cs.CV}\n}", "evaluation_metadata": {}}, "roemmele/ablit": {"name": "roemmele/ablit", "description": "This dataset contains abridged versions of 10 classic English literature books,\naligned with their original versions on various passage levels.The abridgements were written and made publically available by Emma Laybourn: http://www.englishliteratureebooks.com/classicnovelsabridged.html.This is the first known dataset for NLP research that focuses on the abridgement task.", "evaluation_metadata": {}}, "martingrzzler/kanjis2radicals": {"name": "martingrzzler/kanjis2radicals", "description": "Contains Kanji images with corresponding radicals ids from WaniKani or https://api.robanohashi.org/docs/index.html", "evaluation_metadata": {}}, "masakhane/masakhanews": {"name": "masakhane/masakhanews", "description": "MasakhaNEWS is the largest publicly available dataset for news topic classification in 16 languages widely spoken in Africa.\n\nThe languages are:\n- Amharic (amh)\n- English (eng)\n- French (fra)\n- Hausa (hau)\n- Igbo (ibo)\n- Lingala (lin)\n- Luganda (lug)\n- Oromo (orm)\n- Nigerian Pidgin (pcm)\n- Rundi (run)\n- chShona (sna)\n- Somali (som)\n- Kiswahili (sw\u0105)\n- Tigrinya (tir)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n\nThe train/validation/test sets are available for all the 16 languages.\n\nFor more details see *** arXiv link **", "evaluation_metadata": {}}, "renumics/speech_commands_enriched": {"name": "renumics/speech_commands_enriched", "description": "This is a set of one-second .wav audio files, each containing a single spoken\nEnglish word or background noise. These words are from a small set of commands, and are spoken by a\nvariety of different speakers. This data set is designed to help train simple\nmachine learning models. This dataset is covered in more detail at\n[https://arxiv.org/abs/1804.03209](https://arxiv.org/abs/1804.03209).\n\nVersion 0.01 of the data set (configuration `\"v0.01\"`) was released on August 3rd 2017 and contains\n64,727 audio files.\n\nIn version 0.01 thirty different words were recoded: \"Yes\", \"No\", \"Up\", \"Down\", \"Left\",\n\"Right\", \"On\", \"Off\", \"Stop\", \"Go\", \"Zero\", \"One\", \"Two\", \"Three\", \"Four\", \"Five\", \"Six\", \"Seven\", \"Eight\", \"Nine\",\n\"Bed\", \"Bird\", \"Cat\", \"Dog\", \"Happy\", \"House\", \"Marvin\", \"Sheila\", \"Tree\", \"Wow\".\n\n\nIn version 0.02 more words were added: \"Backward\", \"Forward\", \"Follow\", \"Learn\", \"Visual\".\n\nIn both versions, ten of them are used as commands by convention: \"Yes\", \"No\", \"Up\", \"Down\", \"Left\",\n\"Right\", \"On\", \"Off\", \"Stop\", \"Go\". Other words are considered to be auxiliary (in current implementation\nit is marked by `True` value of `\"is_unknown\"` feature). Their function is to teach a model to distinguish core words\nfrom unrecognized ones.\n\nThe `_silence_` class contains a set of longer audio clips that are either recordings or\na mathematical simulation of noise.", "evaluation_metadata": {}}, "renumics/cifar100-enriched": {"name": "renumics/cifar100-enriched", "description": "The CIFAR-100 dataset consists of 60000 32x32 colour images in 100 classes, with 600 images\nper class. There are 500 training images and 100 testing images per class. There are 50000 training images and 10000 test images. The 100 classes are grouped into 20 superclasses.\nThere are two labels per image - fine label (actual class) and coarse label (superclass).", "evaluation_metadata": {}}, "martingrzzler/radicals": {"name": "martingrzzler/radicals", "description": "Contains radical images with radicals ids from WaniKani or https://api.robanohashi.org/docs/index.html", "evaluation_metadata": {}}, "sbmaruf/forai_ml-ted_talk_iwslt": {"name": "sbmaruf/forai_ml-ted_talk_iwslt", "description": "The core of WIT3 is the TED Talks corpus, that basically redistributes the original content published by the TED Conference website (http://www.ted.com). Since 2007,\nthe TED Conference, based in California, has been posting all video recordings of its talks together with subtitles in English\nand their translations in more than 80 languages. Aside from its cultural and social relevance, this content, which is published under the Creative Commons BYNC-ND license, also represents a precious\nlanguage resource for the machine translation research community, thanks to its size, variety of topics, and covered languages.\nThis effort repurposes the original content in a way which is more convenient for machine translation researchers.", "evaluation_metadata": {}}, "sbmaruf/forai_ml_masakhane_mafand": {"name": "sbmaruf/forai_ml_masakhane_mafand", "description": "MAFAND-MT is the largest MT benchmark for African languages in the news domain, covering 21 languages. The languages covered are:\n- Amharic\n- Bambara\n- Ghomala\n- Ewe\n- Fon\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Mossi\n- Nigerian-Pidgin\n- Chichewa\n- Shona\n- Swahili\n- Setswana\n- Twi\n- Wolof\n- Xhosa\n- Yoruba\n- Zulu\nThe train/validation/test sets are available for 16 languages, and validation/test set for amh, kin, nya, sna, and xho\nFor more details see https://aclanthology.org/2022.naacl-main.223/", "evaluation_metadata": {}}, "masakhane/afriqa": {"name": "masakhane/afriqa", "description": "AfriQA: Cross-lingual Open-Retrieval Question Answering for African Languages\n\nAfriQA is the first cross-lingual question answering (QA) dataset with a focus on African languages. \nThe dataset includes over 12,000 XOR QA examples across 10 African languages, making it an invaluable resource for developing more equitable QA technology.", "evaluation_metadata": {}}, "PORTULAN/glue-ptpt": {"name": "PORTULAN/glue-ptpt", "description": "GLUE-PTPT is an European Portuguese translation of the GLUE benchmark using DeepL Pro.", "evaluation_metadata": {}}, "metaeval/xnli": {"name": "metaeval/xnli", "description": "XNLI is a subset of a few thousand examples from MNLI which has been translated\ninto a 14 different languages (some low-ish resource). As with MNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).", "evaluation_metadata": {}}, "may-ohta/tatoeba": {"name": "may-ohta/tatoeba", "description": "This is a collection of translated sentences from Tatoeba\n359 languages, 3,403 bitexts\ntotal number of files: 750\ntotal number of tokens: 65.54M\ntotal number of sentence fragments: 8.96M", "evaluation_metadata": {}}, "vmalperovich/QC": {"name": "vmalperovich/QC", "description": "This data collection contains all the data used in our learning question classification experiments(see [1]), which has question class definitions, the training and testing question sets, examples of preprocessing the questions, feature definition scripts and examples of semantically related word features. \nThis work has been done by Xin Li and Dan Roth and supported by [2].", "evaluation_metadata": {}}, "norabelrose/truthful_qa": {"name": "norabelrose/truthful_qa", "description": "TruthfulQA is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.", "evaluation_metadata": {}}, "matejklemen/clc_fce": {"name": "matejklemen/clc_fce", "description": "The CLC FCE Dataset is a set of 1,244 exam scripts written by candidates sitting the Cambridge ESOL First Certificate \nin English (FCE) examination in 2000 and 2001. The dataset exposes the sentence-level pre-tokenized M2 version, totaling \n33236 sentences.", "evaluation_metadata": {}}, "matejklemen/wi_locness": {"name": "matejklemen/wi_locness", "description": "Write & Improve is an online web platform that assists non-native English students with their writing. Specifically, students from around the world submit letters, stories, articles and essays in response to various prompts, and the W&I system provides instant feedback. Since W&I went live in 2014, W&I annotators have manually annotated some of these submissions and assigned them a CEFR level.\nThe LOCNESS corpus consists of essays written by native English students. It was originally compiled by researchers at the Centre for English Corpus Linguistics at the University of Louvain. Since native English students also sometimes make mistakes, we asked the W&I annotators to annotate a subsection of LOCNESS so researchers can test the effectiveness of their systems on the full range of English levels and abilities.", "evaluation_metadata": {}}, "matejklemen/nucle": {"name": "matejklemen/nucle", "description": "The National University of Singapore Corpus of Learner English (NUCLE) consists of 1,400 essays written by mainly Asian undergraduate students at the National University of Singapore", "evaluation_metadata": {}}, "Dr-BERT/QUAERO": {"name": "Dr-BERT/QUAERO", "description": "The QUAERO French Medical Corpus has been initially developed as a resource for named entity recognition and normalization [1]. It was then improved with the purpose of creating a gold standard set of normalized entities for French biomedical text, that was used in the CLEF eHealth evaluation lab [2][3].\nA selection of MEDLINE titles and EMEA documents were manually annotated. The annotation process was guided by concepts in the Unified Medical Language System (UMLS):\n1. Ten types of clinical entities, as defined by the following UMLS Semantic Groups (Bodenreider and McCray 2003) were annotated: Anatomy, Chemical and Drugs, Devices, Disorders, Geographic Areas, Living Beings, Objects, Phenomena, Physiology, Procedures.\n2. The annotations were made in a comprehensive fashion, so that nested entities were marked, and entities could be mapped to more than one UMLS concept. In particular: (a) If a mention can refer to more than one Semantic Group, all the relevant Semantic Groups should be annotated. For instance, the mention \u201cr\u00e9cidive\u201d (recurrence) in the phrase \u201cpr\u00e9vention des r\u00e9cidives\u201d (recurrence prevention) should be annotated with the category \u201cDISORDER\u201d (CUI C2825055) and the category \u201cPHENOMENON\u201d (CUI C0034897); (b) If a mention can refer to more than one UMLS concept within the same Semantic Group, all the relevant concepts should be annotated. For instance, the mention \u201cmaniaques\u201d (obsessive) in the phrase \u201cpatients maniaques\u201d (obsessive patients) should be annotated with CUIs C0564408 and C0338831 (category \u201cDISORDER\u201d); (c) Entities which span overlaps with that of another entity should still be annotated. For instance, in the phrase \u201cinfarctus du myocarde\u201d (myocardial infarction), the mention \u201cmyocarde\u201d (myocardium) should be annotated with category \u201cANATOMY\u201d (CUI C0027061) and the mention \u201cinfarctus du myocarde\u201d should be annotated with category \u201cDISORDER\u201d (CUI C0027051)\nThe QUAERO French Medical Corpus BioC release comprises a subset of the QUAERO French Medical corpus, as follows:\nTraining data (BRAT version used in CLEF eHealth 2015 task 1b as training data): \n- MEDLINE_train_bioc file: 833 MEDLINE titles, annotated with normalized entities in the BioC format \n- EMEA_train_bioc file: 3 EMEA documents, segmented into 11 sub-documents, annotated with normalized entities in the BioC format \nDevelopment data (BRAT version used in CLEF eHealth 2015 task 1b as test data and in CLEF eHealth 2016 task 2 as development data): \n- MEDLINE_dev_bioc file: 832 MEDLINE titles, annotated with normalized entities in the BioC format\n- EMEA_dev_bioc file: 3 EMEA documents, segmented into 12 sub-documents, annotated with normalized entities in the BioC format \nTest data (BRAT version used in CLEF eHealth 2016 task 2 as test data): \n- MEDLINE_test_bioc folder: 833 MEDLINE titles, annotated with normalized entities in the BioC format \n- EMEA folder_test_bioc: 4 EMEA documents, segmented into 15 sub-documents, annotated with normalized entities in the BioC format \nThis release of the QUAERO French medical corpus, BioC version, comes in the BioC format, through automatic conversion from the original BRAT format obtained with the Brat2BioC tool https://bitbucket.org/nicta_biomed/brat2bioc developped by Jimeno Yepes et al.\nAntonio Jimeno Yepes, Mariana Neves, Karin Verspoor \nBrat2BioC: conversion tool between brat and BioC\nBioCreative IV track 1 - BioC: The BioCreative Interoperability Initiative, 2013\nPlease note that the original version of the QUAERO corpus distributed in the CLEF eHealth challenge 2015 and 2016 came in the BRAT stand alone format. It was distributed with the CLEF eHealth evaluation tool. This original distribution of the QUAERO French Medical corpus is available separately from https://quaerofrenchmed.limsi.fr \nAll questions regarding the task or data should be addressed to aurelie.neveol@limsi.fr", "evaluation_metadata": {}}, "EdwardLin2023/MELD_Audio_3Labels": {"name": "EdwardLin2023/MELD_Audio_3Labels", "description": "Multimodal EmotionLines Dataset (MELD) has been created by enhancing and extending EmotionLines dataset. \nMELD contains the same dialogue instances available in EmotionLines, but it also encompasses audio and \nvisual modality along with text. MELD has more than 1400 dialogues and 13000 utterances from Friends TV series. \nMultiple speakers participated in the dialogues. Each utterance in a dialogue has been labeled by any of these \nseven emotions -- Anger, Disgust, Sadness, Joy, Neutral, Surprise and Fear. MELD also has sentiment (positive, \nnegative and neutral) annotation for each utterance.\n\nThis dataset is slightly modified, so that it concentrates on Emotion recognition in audio input only.", "evaluation_metadata": {}}, "lighteval/pile": {"name": "lighteval/pile", "description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.", "evaluation_metadata": {}}, "StampyAI/alignment-research-dataset": {"name": "StampyAI/alignment-research-dataset", "description": "The AI Alignment Research Dataset is a collection of documents related to AI Alignment and Safety from various books, research papers, and alignment related blog posts.", "evaluation_metadata": {}}, "Saads/birdsdataset": {"name": "Saads/birdsdataset", "description": "VIVOS is a free Vietnamese speech corpus consisting of 15 hours of recording speech prepared for\nVietnamese Automatic Speech Recognition task.\nThe corpus was prepared by AILAB, a computer science lab of VNUHCM - University of Science, with Prof. Vu Hai Quan is the head of.\nWe publish this corpus in hope to attract more scientists to solve Vietnamese speech recognition problems.", "evaluation_metadata": {}}, "TrainingDataPro/selfies_and_id": {"name": "TrainingDataPro/selfies_and_id", "description": "4083 sets, which includes 2 photos of a person from his documents and\n13 selfies. 571 sets of Hispanics and 3512 sets of Caucasians.\nPhoto documents contains only a photo of a person.\nAll personal information from the document is hidden.", "evaluation_metadata": {}}, "TrainingDataPro/anti-spoofing_replay": {"name": "TrainingDataPro/anti-spoofing_replay", "description": "The dataset consists of 40,000 videos and selfies with unique people. 15,000\nattack replays from 4,000 unique devices. 10,000 attacks with A4 printouts and\n10,000 attacks with cut-out printouts.", "evaluation_metadata": {}}, "TrainingDataPro/selfie_and_video": {"name": "TrainingDataPro/selfie_and_video", "description": "4000 people in this dataset. Each person took a selfie on a webcam,\ntook a selfie on a mobile phone. In addition, people recorded video from\nthe phone and from the webcam, on which they pronounced a given set of numbers.\nIncludes folders corresponding to people in the dataset. Each folder includes\n8 files (4 images and 4 videos).", "evaluation_metadata": {}}, "TrainingDataPro/face_masks": {"name": "TrainingDataPro/face_masks", "description": "Dataset includes 250 000 images, 4 types of mask worn on 28 000 unique faces.\nAll images were collected using the Toloka.ai crowdsourcing service and\nvalidated by TrainingData.pro", "evaluation_metadata": {}}, "TrainingDataPro/portrait_and_26_photos": {"name": "TrainingDataPro/portrait_and_26_photos", "description": "Each set includes 27 photos of people. Each person provided\ntwo types of photos: one photo in profile (portrait_1),\nand 26 photos from their life (photo_1, photo_2, \u2026, photo_26).", "evaluation_metadata": {}}, "TrainingDataPro/face_segmentation": {"name": "TrainingDataPro/face_segmentation", "description": "An example of a dataset that we've collected for a photo edit App.\nThe dataset includes 20 selfies of people (man and women)\nin segmentation masks and their visualisations.", "evaluation_metadata": {}}, "moyix/asleep_keyboard": {"name": "moyix/asleep_keyboard", "description": "The Asleep at the Keyboard dataset contains 89 code generation scenarios that are designed to test the ability of code generation models to generate code secure code. The dataset is split into three evaluation axes: diversity of weaknesses (DoW), diversity of prompts (DoP), and diversity of domains (DoD).\n\nTo perform this analysis we prompt Copilot to generate code in scenarios relevant to high-risk cybersecurity weaknesses, e.g. those from MITRE\u2019s \u201cTop 25\u201d Common Weakness Enumeration (CWE) list. We explore Copilot\u2019s performance on three distinct code generation axes\u2014examining how it performs given diversity of weaknesses, diversity of prompts, and diversity of domains. In total, we produce 89 different scenarios", "evaluation_metadata": {}}, "Howuhh/nle_hf_dataset": {"name": "Howuhh/nle_hf_dataset", "description": "3 billion state-action-score transitions from 100,000 trajectories collected from the symbolic bot winner of the NetHack Challenge 2021.", "evaluation_metadata": {}}, "EleutherAI/truthful_qa_mc": {"name": "EleutherAI/truthful_qa_mc", "description": "TruthfulQA-MC is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.", "evaluation_metadata": {}}, "KauPage/SVM": {"name": "KauPage/SVM", "description": "A medium-scale marathi speech corpus for representation learning, semi-supervised learning and interpretation focused on Gurudev's sermons.", "evaluation_metadata": {}}, "EleutherAI/truthful_qa_binary": {"name": "EleutherAI/truthful_qa_binary", "description": "TruthfulQA-Binary is a benchmark to measure whether a language model is truthful in\ngenerating answers to questions. The benchmark comprises 817 questions that\nspan 38 categories, including health, law, finance and politics. Questions are\ncrafted so that some humans would answer falsely due to a false belief or\nmisconception. To perform well, models must avoid generating false answers\nlearned from imitating human texts.", "evaluation_metadata": {}}, "cardy/kohatespeech": {"name": "cardy/kohatespeech", "description": "They provide the first human-annotated Korean corpus for toxic speech detection and the large unlabeled corpus.\nThe data is comments from the Korean entertainment news aggregation platform.", "evaluation_metadata": {}}, "george-chou/pianos_mel": {"name": "george-chou/pianos_mel", "description": "pianos_mel is a mel spectrogram dataset of piano sounds. \nIt consists of 8 kinds of pianos_mel including\nPearlRiver, YoungChang, Steinway-T, Hsinghai, Kawai, Steinway, Kawai-G and Yamaha. \nData was annotated by students from the China Conservatory of Music (CCMUSIC) in Beijing.", "evaluation_metadata": {}}, "patriziobellan/PETv11": {"name": "patriziobellan/PETv11", "description": "Abstract. Although there is a long tradition of work in NLP on extracting entities and relations from text, to date there exists little work on the acquisition of business processes from unstructured data such as textual corpora of process descriptions. With this work we aim at filling this gap and establishing the first steps towards bridging data-driven information extraction methodologies from Natural Language Processing and the model-based formalization that is aimed from Business Process Management. For this, we develop the first corpus of business process descriptions annotated with activities, gateways, actors and flow information. We present our new resource, including a detailed overview of the annotation schema and guidelines, as well as a variety of baselines to benchmark the difficulty and challenges of business process extraction from text.", "evaluation_metadata": {}}, "llm-book/JGLUE": {"name": "llm-book/JGLUE", "description": "JGLUE, Japanese General Language Understanding Evaluation, is built to measure the general NLU ability in Japanese. JGLUE has been constructed from scratch without translation. We hope that JGLUE will facilitate NLU research in Japanese.", "evaluation_metadata": {}}, "inseq/scat": {"name": "inseq/scat", "description": "The Supporting Context for Ambiguous Translations corpus (SCAT) is a dataset \nof English-to-French translations annotated with human rationales used for resolving ambiguity \nin pronoun anaphora resolution for multi-sentence translation.", "evaluation_metadata": {}}, "maxardito/beatbox": {"name": "maxardito/beatbox", "description": " Dataset consisting of isolated beatbox samples ,\n reimplementation of the dataset from the following \n paper: BaDumTss: Multi-task Learning for Beatbox Transcription", "evaluation_metadata": {}}, "danbrown/testman-dataset": {"name": "danbrown/testman-dataset", "description": "Demo dataset for testing or showing image-text capabilities.", "evaluation_metadata": {}}, "hoskinson-center/minif2f-lean4": {"name": "hoskinson-center/minif2f-lean4", "description": "A Lean 4 version of minif2f.", "evaluation_metadata": {}}, "feradauto/NLP4SGPapers": {"name": "feradauto/NLP4SGPapers", "description": "NLP4SGPAPERS dataset: a scientific dataset with three associated tasks that can help identify NLP4SG papers", "evaluation_metadata": {}}, "TrainingDataPro/license_plates": {"name": "TrainingDataPro/license_plates", "description": "Over 1.2 million annotated license plates from vehicles around the world.\nThis dataset is tailored for License Plate Recognition tasks and includes\nimages from both YouTube and PlatesMania. \nAnnotation details are provided in the About section below.", "evaluation_metadata": {}}, "george-chou/emopia_mel": {"name": "george-chou/emopia_mel", "description": "While there are many music datasets with emotion labels in the literature, \nthey cannot be used for research on symbolic-domain music analysis or generation, \nas there are usually audio files only. In this paper, \nwe present the EMOPIA (pronounced `yee-mo-pi-uh') dataset, \na shared multi-modal (audio and MIDI) database focusing on perceived emotion in pop piano music, \nto facilitate research on various tasks related to music emotion. \nThe dataset contains 1,087 music clips from 387 songs and clip-level emotion labels annotated by four dedicated annotators. \nSince the clips are not restricted to one clip per song, they can also be used for song-level analysis. \nWe present the methodology for building the dataset, \ncovering the song list curation, clip selection, and emotion annotation processes. \nMoreover, we prototype use cases on clip-level music emotion classification and \nemotion-based symbolic music generation by training and evaluating corresponding models using the dataset. \nThe result demonstrates the potential of EMOPIA for being used in future exploration on piano emotion-related MIR tasks.", "evaluation_metadata": {}}, "meczifho/QuaeroFrenchMed": {"name": "meczifho/QuaeroFrenchMed", "description": "The QUAEROFrenchMed is a manually annotated corpus developed as a resource for named entity named recognition and normalization.", "evaluation_metadata": {}}, "biglam/dating-historical-color-images": {"name": "biglam/dating-historical-color-images", "description": "This dataset contains color photographs taken between the 1930s and 1970s. \nThe goal of the dataset is to develop methods for dating historical color photographs", "evaluation_metadata": {}}, "seanghay/khPOS": {"name": "seanghay/khPOS", "description": "The khPOS Corpus (Khmer POS Corpus) is a 12,000 sentences (25,626 words) manually word segmented and POS tagged corpus developed for Khmer language NLP research and developments. We collected Khmer sentences from websites that include various area such as economics, news, politics. Moreover it is also contained some student list and voter list of national election committee of Cambodia. The average number of words per sentence in the whole corpus is 10.75. Here, some symbols such as \"\u17d4\" (Khmer sign Khan), \"\u17d6\" (Khmer sign Camnuc pii kuuh), \"-\", \"?\", \"[\", \"]\" etc. also counted as words. The shotest sentence contained only 1 word and longest sentence contained 169 words as follows (here, line number : Khmer sentence):\n\n1814 : \" \u1798\u17c9\u17c2 \u17a5\u178f \u1798\u17b6\u1793 \u179f\u17d2\u17a2\u1794\u17cb_\u1781\u17d2\u1796\u17be\u1798 \u17aa\u1796\u17bb\u1780 \u1780\u17bc\u1793 \u17af\u1784 \u1791\u17c1 \u1798\u17c9\u17c2 \u178f\u17c2\u1784 \u1793\u17b9\u1780 \u1798\u1780 \u1780\u17bc\u1793 \u1793\u17b9\u1784 \u17aa\u1796\u17bb\u1780 \u17af\u1784 \u1796\u17bb\u17c6 \u1798\u17b6\u1793 \u1797\u17d2\u179b\u17c1\u1785 \u1796\u17d2\u179a\u1798_\u1791\u17b6\u17c6\u1784 \u17a2\u17d2\u1793\u1780~\u1797\u17bc\u1798\u17b7 \u1795\u1784 \u179a\u1794\u1784 \u1787\u17b6\u1798\u17bd\u1799 \u17af\u1784 \u1791\u17c0\u178f \u178a\u17c2\u179b \u1798\u17d2\u178a\u17b6\u1799 \u1792\u17d2\u179b\u17b6\u1794\u17cb \u1793\u17c5 \u1787\u17b6\u1798\u17bd\u1799 \u1782\u17c1 \u1794\u17c9\u17bb\u1793\u17d2\u178f\u17c2 \u1798\u17c9\u17c2 \u1787\u17b6\u178f\u17b7 \u1787\u17b6 \u1791\u17c1\u1796_\u1792\u17b8\u178f\u17b6 \u1796\u17bb\u17c6 \u17a2\u17b6\u1785 \u1793\u17c5 \u1787\u17b6\u1798\u17bd\u1799 \u1798\u1793\u17bb\u179f\u17d2\u179f_\u179b\u17c4\u1780 \u1794\u17b6\u1793 \u1799\u17bc\u179a \u1791\u17c1 \u179a\u17b6\u179b\u17cb \u1790\u17d2\u1784\u17c3 \u1798\u17c9\u17c2 \u178f\u17c2\u1784 \u1791\u17c5 \u1794\u17c6\u1796\u17c1\u1789 \u1780\u17b7\u1785\u17d2\u1785 \u1793\u17c5 \u1785\u17c6\u1796\u17c4\u17c7 \u1798\u17bb\u1781 \u1796\u17d2\u179a\u17c7~\u1797\u1780\u17d2\u178f\u17d2\u179a \u1796\u17d2\u179a\u17c7~\u17a5\u1793\u17d2\u1791\u17d2\u179a\u17b6\u1792\u17b7\u179a\u17b6\u1787 \u1782\u17ba \u179f\u17bb\u17c6 \u17a2\u1784\u17d2\u179c\u179a \u17b2\u17d2\u1799 \u1796\u17d2\u179a\u17c7~\u17a2\u1784\u17d2\u1782 \u1794\u17d2\u179a\u1791\u17b6\u1793 \u1796\u179a \u178a\u179b\u17cb \u1780\u17bc\u1793 \u17af\u1784 \u1793\u17b7\u1784 \u17aa\u1796\u17bb\u1780 \u1780\u17bc\u1793 \u17af\u1784 \u1780\u17bb\u17c6 \u1794\u17b8 \u1781\u17b6\u1793 \u1798\u17b7\u1793 \u178f\u17c2 \u1794\u17c9\u17bb\u178e\u17d2\u178e\u17c4\u17c7 \u1798\u17d2\u178a\u17b6\u1799 \u1794\u17b6\u1793 \u1791\u17b6\u17c6\u1784 \u1791\u17bc\u179b \u179f\u17bb\u17c6 \u1796\u17d2\u179a\u17c7~\u17a5\u1793\u17d2\u1791\u17d2\u179a \u17b2\u17d2\u1799 \u1796\u17d2\u179a\u17c7~\u17a2\u1784\u17d2\u1782 \u1798\u17c1\u178f\u17d2\u178f\u17b6 \u1795\u17d2\u179f\u17b6\u1799 \u1793\u17bc\u179c \u179f\u17bb\u1797_\u1798\u1784\u17d2\u1782\u179b \u178a\u179b\u17cb \u1798\u1793\u17bb\u179f\u17d2\u179f \u1793\u17c5 \u178b\u17b6\u1793 \u1793\u17c1\u17c7 \u1791\u17bc\u1791\u17c5 \u1795\u1784 \u1780\u17bc\u1793_\u1794\u17d2\u179a\u17bb\u179f \u1796\u1793\u17d2\u179b\u1780 \u1798\u17d2\u178a\u17b6\u1799 ! \u1798\u17d2\u178a\u17b6\u1799 \u1796\u17bb\u17c6 \u17a2\u17b6\u1785 \u1793\u17c5 \u1787\u17b6\u1798\u17bd\u1799_\u1793\u17b9\u1784 \u1780\u17bc\u1793 \u1794\u17b6\u1793 \u1791\u17c0\u178f \u178f\u17c2 \u1798\u17d2\u178a\u17b6\u1799 \u1799\u1780 \u1780\u17bc\u1793 \u17af\u1784 \u1791\u17c5 \u179b\u17c1\u1784 \u1794\u17d2\u179a\u17b6\u179f\u17b6\u1791 \u1798\u17d2\u178a\u17b6\u1799 \u17af \u178b\u17b6\u1793 \u179b\u17be \u1798\u17bd\u1799 \u178a\u1784 \u1798\u17d2\u178a\u17b6\u1799 \u1793\u17b9\u1784 \u1793\u17b6\u17c6 \u1780\u17bc\u1793 \u17af\u1784 \u1791\u17c5 \u1798\u17bb\u1787_\u1791\u17b9\u1780 \u1780\u17d2\u1793\u17bb\u1784 \u17a2\u17b6\u1784 \u1780\u17d2\u179a\u17a2\u17bc\u1794 \u1793\u17c5_\u1780\u17d2\u1793\u17bb\u1784 \u179f\u17bd\u1793 \u1796\u17d2\u179a\u17c7~\u17a5\u1793\u17d2\u1791\u17d2\u179a \u17a0\u17be\u1799 \u1791\u17b9\u1780 \u1793\u17c4\u17c7 \u1793\u17b9\u1784 \u1787\u1798\u17d2\u179a\u17c7 \u1780\u17b6\u1799 \u1780\u17bc\u1793 \u17af\u1784 \u17b2\u17d2\u1799 \u1794\u17b6\u178f\u17cb \u1792\u17c6 \u1780\u17d2\u179b\u17b7\u1793 \u1798\u1793\u17bb\u179f\u17d2\u179f_\u179b\u17c4\u1780 \u1794\u1793\u17d2\u1791\u17b6\u1794\u17cb_\u1796\u17b8 \u1793\u17c4\u17c7 \u1798\u1780 \u1798\u17d2\u178a\u17b6\u1799 \u1793\u17b9\u1784 \u1793\u17b6\u17c6 \u1780\u17bc\u1793 \u17af\u1784 \u1785\u17bc\u179b \u1791\u17c5_\u1780\u17d2\u1793\u17bb\u1784 \u1794\u17d2\u179a\u17b6\u179f\u17b6\u1791 \u179a\u17bd\u1785 \u1793\u17b6\u17c6 \u1780\u17bc\u1793 \u17af\u1784 \u1791\u17c5 \u1790\u17d2\u179c\u17b6\u1799_\u1794\u1784\u17d2\u179a\u17c7~\u17a5\u1793\u17d2\u1791\u17d2\u179a \" \u17d4", "evaluation_metadata": {}}, "llm-book/jawiki-20220404-c400": {"name": "llm-book/jawiki-20220404-c400", "description": "This dataset is used for AIO (AI\u738b), a competition to promote research on question answering systems for the Japanese language. This dataset contains passages, each of which consists of consecutive sentences \nno longer than 400 characters from Japanese Wikipedia as of 2022-04-04.", "evaluation_metadata": {}}, "howey/super_scirep": {"name": "howey/super_scirep", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "howey/super_scirep_test": {"name": "howey/super_scirep_test", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "akozlova/RuFacts": {"name": "akozlova/RuFacts", "description": "Fact-checking benchmark for the Russian Big Language Models.", "evaluation_metadata": {}}, "Fsoft-AIC/the-vault-function": {"name": "Fsoft-AIC/the-vault-function", "description": "The Vault is a multilingual code-text dataset with over 40 million pairs covering 10 popular programming languages. \nIt is the largest corpus containing parallel code-text data. By building upon The Stack, a massive raw code sample collection, \nthe Vault offers a comprehensive and clean resource for advancing research in code understanding and generation. It provides a \nhigh-quality dataset that includes code-text pairs at multiple levels, such as class and inline-level, in addition to the function level. \nThe Vault can serve many purposes at multiple levels.", "evaluation_metadata": {}}, "tomas-gajarsky/cifar100-lt": {"name": "tomas-gajarsky/cifar100-lt", "description": "The CIFAR-100-LT dataset is comprised of under 60,000 color images, each measuring 32x32 pixels, \ndistributed across 100 distinct classes. \nThe number of samples within each class decreases exponentially with factors of 10 and 100. \nThe dataset includes 10,000 test images, with 100 images per class, \nand fewer than 50,000 training images. \nThese 100 classes are further organized into 20 overarching superclasses. \nEach image is assigned two labels: a fine label denoting the specific class, \nand a coarse label representing the associated superclass.", "evaluation_metadata": {}}, "tobiaslee/VEC": {"name": "tobiaslee/VEC", "description": "Visual and Embodied Concept (VEC) benchmark is designed for evaluating the LLM understanding ability of basic visual (color, shape, size, height and material) and embodied (mass, temperature, hardness) concepts.", "evaluation_metadata": {}}, "tomas-gajarsky/cifar10-lt": {"name": "tomas-gajarsky/cifar10-lt", "description": "The CIFAR-10-LT imbalanced dataset is comprised of under 60,000 color images, each measuring 32x32 pixels, \ndistributed across 10 distinct classes. \nThe dataset includes 10,000 test images, with 1000 images per class, \nand fewer than 50,000 training images.\nThe number of samples within each class of the train set decreases exponentially with factors of 10, 50 or 100.", "evaluation_metadata": {}}, "davanstrien/MAMe": {"name": "davanstrien/MAMe", "description": "This dataset contains color photographs taken between the 1930s and 1970s. \nThe goal of the dataset is to develop methods for dating historical color photographs", "evaluation_metadata": {}}, "gsarti/iwslt2017_context": {"name": "gsarti/iwslt2017_context", "description": "The IWSLT 2017 Multilingual Task addresses text translation, including zero-shot translation, with a single MT system across all directions including English, German, Dutch, Italian and Romanian. As unofficial task, conventional bilingual text translation is offered between English and Arabic, French, Japanese, Chinese, German and Korean.", "evaluation_metadata": {}}, "turkish-nlp-suite/beyazperde-top-300-movie-reviews": {"name": "turkish-nlp-suite/beyazperde-top-300-movie-reviews", "description": "[BeyazPerde Top 300 Movie Reviews Dataset](https://github.com/turkish-nlp-suite/BeyazPerde-Movie-Reviews/)", "evaluation_metadata": {}}, "tomaarsen/conll2003": {"name": "tomaarsen/conll2003", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": [{"config": "conll2003", "task": "token-classification", "task_id": "entity_extraction", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"tokens": "tokens", "ner_tags": "tags"}, "metrics": [{"type": "seqeval", "name": "seqeval"}]}]}, "mainlp/aed_atis": {"name": "mainlp/aed_atis", "description": "This dataset is designed for Annotation Error Detection.", "evaluation_metadata": {}}, "mainlp/aed_conll": {"name": "mainlp/aed_conll", "description": "This dataset is designed for Annotation Error Detection.", "evaluation_metadata": {}}, "mainlp/inconsistencies_companies": {"name": "mainlp/inconsistencies_companies", "description": "This dataset is designed for Annotation Error Detection.", "evaluation_metadata": {}}, "mainlp/inconsistencies_flights": {"name": "mainlp/inconsistencies_flights", "description": "This dataset is designed for Annotation Error Detection.", "evaluation_metadata": {}}, "mainlp/inconsistencies_forex": {"name": "mainlp/inconsistencies_forex", "description": "This dataset is designed for Annotation Error Detection.", "evaluation_metadata": {}}, "mainlp/pervasive_imdb": {"name": "mainlp/pervasive_imdb", "description": "This dataset is designed for Annotation Error Detection.", "evaluation_metadata": {}}, "matejklemen/akces_gec": {"name": "matejklemen/akces_gec", "description": "AKCES-GEC is a grammar error correction corpus for Czech generated from a subset of AKCES resources.", "evaluation_metadata": {}}, "TrainingDataPro/printed_photos_attacks": {"name": "TrainingDataPro/printed_photos_attacks", "description": "The dataset consists of 40,000 videos and selfies with unique people. 15,000\nattack replays from 4,000 unique devices. 10,000 attacks with A4 printouts and\n10,000 attacks with cut-out printouts.", "evaluation_metadata": {}}, "matejklemen/falko_merlin": {"name": "matejklemen/falko_merlin", "description": "Falko-MERLIN is a grammatical error correction corpus consisting of essays and exams.", "evaluation_metadata": {}}, "noorlight/captioned_dataset": {"name": "noorlight/captioned_dataset", "description": "Google's Conceptual Captions dataset has more than 3 million images, paired with natural-language captions.\nIn contrast with the curated style of the MS-COCO images, Conceptual Captions images and their raw descriptions are harvested from the web,\nand therefore represent a wider variety of styles. The raw descriptions are harvested from the Alt-text HTML attribute associated with web images.\nThe authors developed an automatic pipeline that extracts, filters, and transforms candidate image/caption pairs, with the goal of achieving a balance of cleanliness,\ninformativeness, fluency, and learnability of the resulting captions.", "evaluation_metadata": {}}, "yuyang/bart_cnndm": {"name": "yuyang/bart_cnndm", "description": "CNN/DailyMail non-anonymized summarization dataset.\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary", "evaluation_metadata": {}}, "juletxara/mgsm": {"name": "juletxara/mgsm", "description": "Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems, proposed in the paper [Language models are multilingual chain-of-thought reasoners](http://arxiv.org/abs/2210.03057).\n\nThe same 250 problems from [GSM8K](https://arxiv.org/abs/2110.14168) are each translated via human annotators in 10 languages. The 10 languages are:\n- Spanish\n- French\n- German\n- Russian\n- Chinese\n- Japanese\n- Thai\n- Swahili\n- Bengali\n- Telugu\n\nYou can find the input and targets for each of the ten languages (and English) as `.tsv` files.\nWe also include few-shot exemplars that are also manually translated from each language in `exemplars.py`.", "evaluation_metadata": {}}, "ctu-aic/csfever_v2": {"name": "ctu-aic/csfever_v2", "description": "This new dataset is aimed on Czech fact-checking task.", "evaluation_metadata": {}}, "chan127ck/temp-dataset": {"name": "chan127ck/temp-dataset", "description": "This is a dataset for wines in various regions around the world with names, regions, ratings and descriptions", "evaluation_metadata": {}}, "gneubig/dstc11": {"name": "gneubig/dstc11", "description": "This repository contains data, relevant scripts and baseline code for the Dialog Systems Technology Challenge (DSTC11).", "evaluation_metadata": {}}, "lighteval/lextreme": {"name": "lighteval/lextreme", "description": "The LEXTREME Benchmark is a collection of multilingual datasets for evaluating model performance \nacross a diverse set of legal NLU tasks.", "evaluation_metadata": {}}, "lighteval/lexglue": {"name": "lighteval/lexglue", "description": "Legal General Language Understanding Evaluation (LexGLUE) benchmark is\na collection of datasets for evaluating model performance across a diverse set of legal NLU tasks", "evaluation_metadata": {}}, "c3po-ai/edgar-corpus": {"name": "c3po-ai/edgar-corpus", "description": "The dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).", "evaluation_metadata": {}}, "lexlms/legal_lama": {"name": "lexlms/legal_lama", "description": "LegalLAMA: Legal LAnguage Model Analysis (LAMA) (LAMA) dataset.", "evaluation_metadata": {}}, "bandoos/conll2003-mini": {"name": "bandoos/conll2003-mini", "description": "\n!! forked version producing at most 10 items per split !!\n\nThe shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "evaluation_metadata": {}}, "SeanSleat/lj_speech": {"name": "SeanSleat/lj_speech", "description": "This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading\npassages from 7 non-fiction books in English. A transcription is provided for each clip. Clips vary in length\nfrom 1 to 10 seconds and have a total length of approximately 24 hours.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .wav format and is not converted to a float32 array. To convert the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```", "evaluation_metadata": {}}, "TenzinGayche/Demo-datasets": {"name": "TenzinGayche/Demo-datasets", "description": "Stanford Question Answering Dataset (DemoDatasets) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "lighteval/med_dialog": {"name": "lighteval/med_dialog", "description": "\"The MedDialog dataset (English) contains conversations between doctors and patients.\n It has 0.26 million dialogues. The data is continuously growing and more dialogues will be added.\n The raw dialogues are from healthcaremagic.com and icliniq.com. All copyrights of the data belong\n to healthcaremagic.com and icliniq.com.\"\n\n The following is an example from the healthcaremagic.com subset:\n\n Patient: I get cramps on top of my left forearm and hand and it causes my hand and fingers to draw up and it\n hurts. It mainly does this when I bend my arm. I ve been told that I have a slight pinch in a nerve in my neck.\n Could this be a cause? I don t think so. Doctor: Hi there. It may sound difficult to believe it ,but the nerves\n which supply your forearms and hand, start at the level of spinal cord and on their way towards the forearm and\n hand regions which they supply, the course of these nerves pass through difference fascial and muscular planes\n that can make them susceptible to entrapment neuropathies. Its a group of conditions where a nerve gets\n compressed between a muscle and a bone, or between the fibers of a muscle that it pierces or passes through.\n Also, the compression can happen when the nerves are travelling around a blood vessel which can mechanically put\n pressure on them. Usually patients who would be having such a problem present with a dull aching pain over the\n arm and forearm. If it is not too severe and does not cause any neurological deficits then conservative management\n with Pregabalin and Vitamin B complex tablets, activity modifications and physiotherapy can be started which\n will provide relief. Avoid the activities which exaggerate your problem.\n\n Could painful forearms be related to pinched nerve in neck?\n\n\n The following is an example from the icliniq.com subset:\n\n Patient: Hello doctor, We are looking for a second opinion on my friend's MRI scan of both the knee joints as he\n is experiencing excruciating pain just above the patella. He has a sudden onset of severe pain on both the knee\n joints about two weeks ago. Previously he had a similar episode about two to three months ago and it subsided\n after resting and painkillers. Doctor: Hi. I viewed the right and left knee MRI images. (attachment removed to\n protect patient identity). Left knee: The MRI, left knee joint shows a complex tear in the posterior horn of the\n medial meniscus area and mild left knee joint effusion. There is some fluid between the semimembranous and medial\n head of gastrocnemius muscles. There is a small area of focal cartilage defect in the upper pole of the patella\n with mild edematous fat. The anterior and posterior cruciate ligaments are normal. The medial and lateral\n collateral ligaments are normal. Right knee: The right knee joint shows mild increased signal intensity in the\n posterior horn of the medial meniscus area and minimal knee joint effusion. There is minimal fluid in the back\n of the lower thigh and not significant. There is a suspicious strain in the left anterior cruciate ligament\n interiorly but largely the attachments are normal. The posterior cruciate ligament is normal. There are subtle\n changes in the upper pole area of the right patella and mild edema. There is mild edema around the bilateral\n distal quadriceps tendons, but there is no obvious tear of the tendons.\n\n My friend has excruciating knee pain. Please interpret his MRI report\n\n\n Paper: https://arxiv.org/abs/2004.03329\n Code: https://github.com/UCSD-AI4H/Medical-Dialogue-System\n\n @article{chen2020meddiag,\n title={MedDialog: a large-scale medical dialogue dataset},\n author={Chen, Shu and Ju, Zeqian and Dong, Xiangyu and Fang, Hongchao and Wang, Sicheng and Yang, Yue and Zeng,\n Jiaqi and Zhang, Ruisi and Zhang, Ruoyu and Zhou, Meng and Zhu, Penghui and Xie, Pengtao},\n journal={arXiv preprint arXiv:2004.03329},\n year={2020}\n }\n\n We used the data preprocessing from \"BioBART: Pretraining and Evaluation o A Biomedical Generative Language Model\"\n (Yuan et al.) and generated the following splits:\n\n |Dataset | Train | Valid | Test |\n |--------------- |------------|---------|--------|\n |HealthCareMagic | 181,122 | 22,641 | 22,642 |\n |iCliniq | 24,851 | 3,105 | 3,108 |\n\n Yuan et al. described, \"HealthCareMagic's summaries are more abstractive and are written in a formal style,\n unlike iCliniq's patient-written summaries.\"\n\n Paper: https://arxiv.org/abs/2204.03905\n Code: https://github.com/GanjinZero/BioBART\n\n @misc{https://doi.org/10.48550/arxiv.2204.03905,\n doi = {10.48550/ARXIV.2204.03905},\n url = {https://arxiv.org/abs/2204.03905},\n author = {Yuan, Hongyi and Yuan, Zheng and Gan, Ruyi and Zhang, Jiaxing and Xie, Yutao and Yu, Sheng},\n keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences,\n FOS: Computer and information sciences},\n title = {BioBART: Pretraining and Evaluation of A Biomedical Generative Language Model},\n publisher = {arXiv},\n year = {2022},\n copyright = {arXiv.org perpetual, non-exclusive license}\n }", "evaluation_metadata": {}}, "erjoy/setFit": {"name": "erjoy/setFit", "description": "The SNLI corpus (version 1.0) is a collection of 570k human-written English\nsentence pairs manually labeled for balanced classification with the labels\nentailment, contradiction, and neutral, supporting the task of natural language\ninference (NLI), also known as recognizing textual entailment (RTE).", "evaluation_metadata": {}}, "techiaith/banc-trawsgrifiadau-bangor": {"name": "techiaith/banc-trawsgrifiadau-bangor", "description": "Dyma fanc o 25 awr 34 munud a 24 eiliad o segmentau o leferydd naturiol dros hanner cant o gyfranwyr ar ffurf ffeiliau mp3, ynghyd \u00e2 thrawsgrifiadau 'verbatim' cyfatebol o\u2019r lleferydd ar ffurf ffeil .tsv. Mae'r mwyafrif o'r lleferydd yn leferydd digymell, naturiol. Dosbarthwn y deunydd hwn o dan drwydded agored CC0.\n\nThis resource is a bank of 25 hours 34 minutes and 24 seconds of segments of natural speech from over 50 contributors in mp3 file format, together with corresponding 'verbatim' transcripts of the speech in .tsv file format. The majority of the speech is spontaneous, natural speech. We distribute this material under a CC0 open license.", "evaluation_metadata": {}}, "claritylab/utcd": {"name": "claritylab/utcd", "description": "UTCD is a compilation of 18 classification datasets spanning 3 categories of Sentiment, \nIntent/Dialogue and Topic classification. UTCD focuses on the task of zero-shot text classification where the \ncandidate labels are descriptive of the text being classified. UTCD consists of ~ 6M/800K train/test examples.", "evaluation_metadata": {}}, "yogesh0502/cuad_v1": {"name": "yogesh0502/cuad_v1", "description": "Contract Understanding Atticus Dataset (CUAD) v1 is a corpus of 13,000+ labels in 510 commercial legalcontracts that have been manually labeled under the supervision of experienced lawyers to identify 41types of legal clauses that are considered important in contact review in connection with a corporate transaction, including mergers & acquisitions, etc.", "evaluation_metadata": {}}, "jainr3/diffusiondb-pixelart": {"name": "jainr3/diffusiondb-pixelart", "description": "DiffusionDB is the first large-scale text-to-image prompt dataset. It contains 2\nmillion images generated by Stable Diffusion using prompts and hyperparameters\nspecified by real users. The unprecedented scale and diversity of this\nhuman-actuated dataset provide exciting research opportunities in understanding\nthe interplay between prompts and generative models, detecting deepfakes, and\ndesigning human-AI interaction tools to help users more easily use these models.", "evaluation_metadata": {}}, "openpecha/tibetan_voice": {"name": "openpecha/tibetan_voice", "description": "TibetanVoice: 6.5 hours of validated transcribed speech data from 9 audio book in lhasa dialect. The dataset is in tsv format with two columns, path and sentence. The path column contains the path to the audio file and the sentence column contains the corresponding sentence spoken in the audio file.", "evaluation_metadata": {}}, "lighteval/summarization": {"name": "lighteval/summarization", "description": "Scenario for single document text summarization.\n Currently supports the following datasets:\n 1. XSum (https://arxiv.org/pdf/1808.08745.pdf)\n 2. CNN/DailyMail non-anonymized (https://arxiv.org/pdf/1704.04368.pdf)\n\n Task prompt structure\n\n Summarize the given document.\n Document: {tok_1 ... tok_n}\n Summary: {tok_1 ... tok_m}\n\n Example from XSum dataset\n\n Document: {Part of the Broad Road was closed to traffic on Sunday at about 18:00 GMT.\n The three adults and three children have been taken to Altnagelvin Hospital\n with non life-threatening injuries. The Fire Service, Northern Ireland Ambulance Service\n and police attended the crash. The Broad Road has since been reopened.}\n Summary: {Three adults and three children have been taken to hospital following a crash involving\n a tractor and a campervan in Limavady, County Londonderry}", "evaluation_metadata": {}}, "lighteval/wmt_14": {"name": "lighteval/wmt_14", "description": "The 2014 Workshop on Statistical Machine Translation:\n https://aclanthology.org/W14-3302.pdf\n\n The scenario consists of 5 subsets, each of which is a parallel corpus between English and another language. The\n non-English languages include Czech, German, French, Hindi, and Russian.\n\n For each language pair, the validation and test set each includes around 3,000 examples, while the training set is\n usually much larger. We therefore randomly downsample the training set to speedup data processing.\n\n Task prompt structure:\n\n Translate {source_language} to {target_language}:\n {Hypothesis} = {Reference}\n\n Example from WMT14 Fr-En:\n\n Hypothesis: Assembl\u00e9e g\u00e9n\u00e9rale\n Reference: General Assembly", "evaluation_metadata": {}}, "lighteval/wikitext_103": {"name": "lighteval/wikitext_103", "description": "Wikitext-103 dataset from this paper:\n https://arxiv.org/pdf/1609.07843.pdf\n\n Gopher's authors concatenate all the articles, set context length to n/2 (n = max_seq_len),\n and use the \"closed vocabulary\" variant of the dataset for evaluation.\n\n In contrast, we evaluate the model on each article independently, use single token contexts\n (except for the last sequence in each document), and use the raw dataset.", "evaluation_metadata": {}}, "biglam/on_the_books": {"name": "biglam/on_the_books", "description": "This file is the training set that was used to train an algorithm to identify Jim Crow laws.\nIt contains laws that are labeled as \"Jim Crow\" (jim_crow=1) or \"Not Jim Crow\" (jim_crow=0).\nThe source of the determination is also provided.", "evaluation_metadata": {}}, "a6kme/minds14-mirror": {"name": "a6kme/minds14-mirror", "description": "MINDS-14 is training and evaluation resource for intent\ndetection task with spoken data. It covers 14\nintents extracted from a commercial system\nin the e-banking domain, associated with spoken examples in 14 diverse language varieties.", "evaluation_metadata": {}}, "tatsu-lab/alpaca_farm": {"name": "tatsu-lab/alpaca_farm", "description": "Data used in the original AlpacaFarm experiments.\nIncludes SFT and preference examples.", "evaluation_metadata": {}}, "yuyang/distil_xsum": {"name": "yuyang/distil_xsum", "description": "Distilled Extreme Summarization (XSum) Dataset.\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.\n\nThe pseudo labels are generated by running google/pegasus-xsum on XSum.", "evaluation_metadata": {}}, "ceval/ceval-exam": {"name": "ceval/ceval-exam", "description": "C-Eval is a comprehensive Chinese evaluation suite for foundation models. It consists of 13948 multi-choice questions spanning 52 diverse disciplines and four difficulty levels.", "evaluation_metadata": {}}, "lighteval/mmlu": {"name": "lighteval/mmlu", "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.", "evaluation_metadata": {}}, "omniquad/BC5CDR-IOB": {"name": "omniquad/BC5CDR-IOB", "description": "The automatic extraction of chemical information from text requires the recognition of chemical entity mentions as one of its key steps. When developing supervised named entity recognition (NER) systems, the availability of a large, manually annotated text corpus is desirable. Furthermore, large corpora permit the robust evaluation and comparison of different approaches that detect chemicals in documents. We present the CHEMDNER corpus, a collection of 10,000 PubMed abstracts that contain a total of 84,355 chemical entity mentions labeled manually by expert chemistry literature curators, following annotation guidelines specifically defined for this task. The abstracts of the CHEMDNER corpus were selected to be representative for all major chemical disciplines. Each of the chemical entity mentions was manually labeled according to its structure-associated chemical entity mention (SACEM) class: abbreviation, family, formula, identifier, multiple, systematic and trivial. The difficulty and consistency of tagging chemicals in text was measured using an agreement study between annotators, obtaining a percentage agreement of 91. For a subset of the CHEMDNER corpus (the test set of 3,000 abstracts) we provide not only the Gold Standard manual annotations, but also mentions automatically detected by the 26 teams that participated in the BioCreative IV CHEMDNER chemical mention recognition task. In addition, we release the CHEMDNER silver standard corpus of automatically extracted mentions from 17,000 randomly selected PubMed abstracts. A version of the CHEMDNER corpus in the BioC format has been generated as well. We propose a standard for required minimum information about entity annotations for the construction of domain specific corpora on chemical and drug entities. The CHEMDNER corpus and annotation guidelines are available at: http://www.biocreative.org/resources/biocreative-iv/chemdner-corpus/", "evaluation_metadata": {}}, "omniquad/BioNLP11ID-ggp-IOB": {"name": "omniquad/BioNLP11ID-ggp-IOB", "description": "The automatic extraction of chemical information from text requires the recognition of chemical entity mentions as one of its key steps. When developing supervised named entity recognition (NER) systems, the availability of a large, manually annotated text corpus is desirable. Furthermore, large corpora permit the robust evaluation and comparison of different approaches that detect chemicals in documents. We present the CHEMDNER corpus, a collection of 10,000 PubMed abstracts that contain a total of 84,355 chemical entity mentions labeled manually by expert chemistry literature curators, following annotation guidelines specifically defined for this task. The abstracts of the CHEMDNER corpus were selected to be representative for all major chemical disciplines. Each of the chemical entity mentions was manually labeled according to its structure-associated chemical entity mention (SACEM) class: abbreviation, family, formula, identifier, multiple, systematic and trivial. The difficulty and consistency of tagging chemicals in text was measured using an agreement study between annotators, obtaining a percentage agreement of 91. For a subset of the CHEMDNER corpus (the test set of 3,000 abstracts) we provide not only the Gold Standard manual annotations, but also mentions automatically detected by the 26 teams that participated in the BioCreative IV CHEMDNER chemical mention recognition task. In addition, we release the CHEMDNER silver standard corpus of automatically extracted mentions from 17,000 randomly selected PubMed abstracts. A version of the CHEMDNER corpus in the BioC format has been generated as well. We propose a standard for required minimum information about entity annotations for the construction of domain specific corpora on chemical and drug entities. The CHEMDNER corpus and annotation guidelines are available at: http://www.biocreative.org/resources/biocreative-iv/chemdner-corpus/", "evaluation_metadata": {}}, "osunlp/AttrScore": {"name": "osunlp/AttrScore", "description": " We construct this dataset, which contains both training and test data for the evaluation of attribution. \n The training data are repurposed from related tasks, such as question answering, fact-checking, \n natural language inference, and summarization. The test data contains a set simulated from QA datasets \n and a set manually curated from a generative search engine, New Bing.", "evaluation_metadata": {}}, "hltcoe/megawika": {"name": "hltcoe/megawika", "description": "MegaWika is a multi- and crosslingual text dataset containing 30 million\nWikipedia passages with their scraped and cleaned web citations. The\npassages span 50 Wikipedias in 50 languages, and the articles in which\nthe passages were originally embedded are included for convenience. Where\na Wikipedia passage is in a non-English language, an automated English\ntranslation is provided. Furthermore, nearly 130 million English\nquestion/answer pairs were extracted from the passages, and FrameNet events\noccurring in the passages are detected using the LOME FrameNet parser.", "evaluation_metadata": {}}, "thewall/tg2": {"name": "thewall/tg2", "description": "PRJDB9110\nhttps://www.ebi.ac.uk/ena/browser/view/PRJDB9110\nTo generate RNA aptamers against human transglutaminase 2, we have performed the high-throughput systematic evolution of ligands by exponential enrichment (HT-SELEX). Of the eight performed rounds, the rounds 0 to 8 have been sequenced.", "evaluation_metadata": {}}, "scofieldlin/scofieldlin_2": {"name": "scofieldlin/scofieldlin_2", "description": "This new dataset is designed to solve this great NLP task and is crafted with a lot of care.", "evaluation_metadata": {}}, "thewall/alphaVbeta3": {"name": "thewall/alphaVbeta3", "description": "PRJDB9111\nhttps://www.ebi.ac.uk/ena/browser/view/PRJDB9111\nTo generate RNA aptamers against human integrin alphaV beta3, we have performed the high-throughput systematic evolution of ligands by exponential enrichment (HT-SELEX). Of the six performed rounds, the rounds 3 to 6 have been sequenced.", "evaluation_metadata": {}}, "yanchao/cifar10buqi": {"name": "yanchao/cifar10buqi", "description": "The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images\nper class. There are 50000 training images and 10000 test images.", "evaluation_metadata": {}}, "RossVermouth/chensu_test_dataset": {"name": "RossVermouth/chensu_test_dataset", "description": "The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images\nper class. There are 50000 training images and 10000 test images.", "evaluation_metadata": {}}, "RossVermouth/chensu_test_dataset1": {"name": "RossVermouth/chensu_test_dataset1", "description": "The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images\nper class. There are 50000 training images and 10000 test images.", "evaluation_metadata": {}}, "RossVermouth/chensu_test_dataset2": {"name": "RossVermouth/chensu_test_dataset2", "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's C4 dataset by AllenAI.", "evaluation_metadata": {}}, "TrainingDataPro/facial_keypoint_detection": {"name": "TrainingDataPro/facial_keypoint_detection", "description": "The dataset is designed for computer vision and machine learning tasks\ninvolving the identification and analysis of key points on a human face.\nIt consists of images of human faces, each accompanied by key point\nannotations in XML format.", "evaluation_metadata": {}}, "TrainingDataPro/pose_estimation": {"name": "TrainingDataPro/pose_estimation", "description": "The dataset is primarly intended to dentify and predict the positions of major\njoints of a human body in an image. It consists of people's photographs with\nbody part labeled with keypoints.", "evaluation_metadata": {}}, "asoria/duorc": {"name": "asoria/duorc", "description": "DuoRC contains 186,089 unique question-answer pairs created from a collection of 7680 pairs of movie plots where each pair in the collection reflects two versions of the same movie.", "evaluation_metadata": {}}, "asoria/mnist": {"name": "asoria/mnist", "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.", "evaluation_metadata": {}}, "KELONMYOSA/dusha_emotion_audio": {"name": "KELONMYOSA/dusha_emotion_audio", "description": "Dusha is a bi-modal corpus suitable for speech emotion recognition (SER) tasks. \nThe dataset consists of audio recordings with Russian speech and their emotional labels. \nThe corpus contains approximately 350 hours of data. Four basic emotions that usually appear in a dialog with\n a virtual assistant were selected: Happiness (Positive), Sadness, Anger and Neutral emotion.", "evaluation_metadata": {}}, "RossVermouth/test_dataset": {"name": "RossVermouth/test_dataset", "description": "This is a test dataset used to demonstrate the process of creating a hugging face dataset", "evaluation_metadata": {}}, "Muennighoff/xP3x": {"name": "Muennighoff/xP3x", "description": "xP3x (Crosslingual Public Pool of Prompts eXtended) is a collection of prompts & datasets across 280 of languages & 16 NLP tasks.", "evaluation_metadata": {}}, "tau/zero_scrolls": {"name": "tau/zero_scrolls", "description": "ZeroSCROLLS: Zero-Shot CompaRison Over Long Language Sequences.\nA zero shot benchmark for long text reasoning.\nhttps://zero.scrolls-benchmark.com/", "evaluation_metadata": {}}, "Muennighoff/multi_eurlex": {"name": "Muennighoff/multi_eurlex", "description": "MultiEURLEX comprises 65k EU laws in 23 official EU languages (some low-ish resource).\nEach EU law has been annotated with EUROVOC concepts (labels) by the Publication Office of EU.\nAs with the English EURLEX, the goal is to predict the relevant EUROVOC concepts (labels);\nthis is multi-label classification task (given the text, predict multiple labels).", "evaluation_metadata": {}}, "THUDM/ImageRewardDB": {"name": "THUDM/ImageRewardDB", "description": "ImageRewardDB is a comprehensive text-to-image comparison dataset, focusing on text-to-image human preference. It consists of 137k pairs of expert comparisons, based on text prompts and corresponding model outputs from DiffusionDB. To build the ImageRewadDB, we design a pipeline tailored for it, establishing criteria for quantitative assessment and annotator training, optimizing labeling experience, and ensuring quality validation. \\", "evaluation_metadata": {}}, "RossVermouth/cs_test_dataset": {"name": "RossVermouth/cs_test_dataset", "description": "This is a test dataset used to demonstrate the process of creating a hugging face dataset", "evaluation_metadata": {}}, "juletxara/xstory_cloze_mt": {"name": "juletxara/xstory_cloze_mt", "description": "XStoryCloze consists of the professionally translated version of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 non-English languages. This dataset is released by Meta AI.", "evaluation_metadata": {}}, "juletxara/mgsm_mt": {"name": "juletxara/mgsm_mt", "description": "Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems, proposed in the paper [Language models are multilingual chain-of-thought reasoners](http://arxiv.org/abs/2210.03057).\n\nThe same 250 problems from [GSM8K](https://arxiv.org/abs/2110.14168) are each translated via human annotators in 10 languages. The 10 languages are:\n- Spanish\n- French\n- German\n- Russian\n- Chinese\n- Japanese\n- Thai\n- Swahili\n- Bengali\n- Telugu\n\nYou can find the input and targets for each of the ten languages (and English) as `.tsv` files.\nWe also include few-shot exemplars that are also manually translated from each language in `exemplars.py`.", "evaluation_metadata": {}}, "juletxara/xwinograd_mt": {"name": "juletxara/xwinograd_mt", "description": "A multilingual collection of Winograd Schemas in six languages that can be used for evaluation of cross-lingual commonsense reasoning capabilities.", "evaluation_metadata": {}}, "wyxu/dataset_copied": {"name": "wyxu/dataset_copied", "description": "The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, with 6000 images\nper class. There are 50000 training images and 10000 test images.", "evaluation_metadata": {}}, "coeuslearning/yelp_review_full": {"name": "coeuslearning/yelp_review_full", "description": "The Yelp reviews dataset consists of reviews from Yelp. It is extracted from the Yelp Dataset Challenge 2015 data.\nThe Yelp reviews full star dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the above dataset.\nIt is first used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun.\nCharacter-level Convolutional Networks for Text Classification. Advances in Neural Information Processing Systems 28 (NIPS 2015).", "evaluation_metadata": {}}, "juletxara/xwinograd": {"name": "juletxara/xwinograd", "description": "A multilingual collection of Winograd Schemas in six languages that can be used for evaluation of cross-lingual commonsense reasoning capabilities.", "evaluation_metadata": {}}, "Stardrums/pico-breast-cancer": {"name": "Stardrums/pico-breast-cancer", "description": "The corpus consists of about 1,011 PubMed abstracts which are RCTs related\nto breast cancer. For each abstract, text snippets that identify the\nParticipants, Intervention, Control, and Outcome (PICO elements) are annotated.\nThe abstracts were annotated using BRAT (https://brat.nlplab.org/) and later\nconverted to IOB format.", "evaluation_metadata": {}}, "juletxara/xcopa_mt": {"name": "juletxara/xcopa_mt", "description": " XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning\nThe Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across\nlanguages. The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around\nthe globe. The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages. All the details about the\ncreation of XCOPA and the implementation of the baselines are available in the paper.\\n", "evaluation_metadata": {}}, "juletxara/pawsx_mt": {"name": "juletxara/pawsx_mt", "description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.", "evaluation_metadata": {}}, "juletxara/xnli_mt": {"name": "juletxara/xnli_mt", "description": "XNLI is a subset of a few thousand examples from MNLI which has been translated\ninto a 14 different languages (some low-ish resource). As with MNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).", "evaluation_metadata": {}}, "dev2bit/es2bash": {"name": "dev2bit/es2bash", "description": "This dataset consisting of natural language requests (in Spanish) and the bash command that resolves it.", "evaluation_metadata": {}}, "RussianNLP/RuSpellGold": {"name": "RussianNLP/RuSpellGold", "description": "RuSpellGold is a benchmark of 1711 sentence pairs \n dedicated to a problem of automatic spelling correction in Russian language. \n The dataset is gathered from five different domains including news, Russian classic literature,\n social media texts, open web and strategic documents. \n It has been passed through two-stage manual labeling process with native speakers as annotators\n to correct spelling violation and preserve original style of text at the same time.", "evaluation_metadata": {}}, "Brand24/mms": {"name": "Brand24/mms", "description": " This work presents the most extensive open massively multi-lingual corpus of datasets for training sentiment models. \n The corpus consists of 79 manually selected from over 350 datasets reported in the scientific literature based on strict quality criteria and covers 25 languages. \n Datasets can be queried using several linguistic and functional features. \n In addition, we present a multi-faceted sentiment classification benchmark summarizing hundreds of experiments conducted on different base models, training objectives, dataset collections, and fine-tuning strategies.", "evaluation_metadata": {}}, "sihaochen/propsegment": {"name": "sihaochen/propsegment", "description": "This is a reproduced (i.e. after web-crawling) and processed version of the \"PropSegment\" dataset from Google Research.\n\nSince the News portion of the dataset is released only via urls, we reconstruct the dataset by crawling. Overall, ~96% \nof the dataset can be reproduced, and the rest ~4% either have url no longer valid, or sentences that have been edited \n(i.e. cannot be aligned with the orignial dataset).\n\nPropSegment (Proposition-level Segmentation and Entailment) is a large-scale, human annotated dataset for segmenting \nEnglish text into propositions, and recognizing proposition-level entailment relations --- whether a different, related \ndocument entails each proposition, contradicts it, or neither.\n\nThe original dataset features >45k human annotated propositions, i.e. individual semantic units within sentences, as \nwell as >45k entailment labels between propositions and documents.", "evaluation_metadata": {}}, "vmalperovich/SST5": {"name": "vmalperovich/SST5", "description": "This data collection contains all the data used in our learning question classification experiments(see [1]), which has question class definitions, the training and testing question sets, examples of preprocessing the questions, feature definition scripts and examples of semantically related word features. \nThis work has been done by Xin Li and Dan Roth and supported by [2].", "evaluation_metadata": {}}, "vmalperovich/20ng": {"name": "vmalperovich/20ng", "description": "This data collection contains all the data used in our learning question classification experiments(see [1]), which has question class definitions, the training and testing question sets, examples of preprocessing the questions, feature definition scripts and examples of semantically related word features. \nThis work has been done by Xin Li and Dan Roth and supported by [2].", "evaluation_metadata": {}}, "ccmusic-database/acapella_evaluation": {"name": "ccmusic-database/acapella_evaluation", "description": "This database contains 6 Mandarin song segments sung by 22 singers, totaling 132 audio clips. \nEach segment consists of a verse and a chorus. Four judges evaluate the singing from nine aspects \nwhich are pitch, rhythm, vocal range, timbre, pronunciation, vibrato, dynamic, breath control and \noverall performance on a 10-point scale. The scores are recorded in a sheet.", "evaluation_metadata": {}}, "ccmusic-database/piano_sound_quality": {"name": "ccmusic-database/piano_sound_quality", "description": "Piano-Sound-Quality-Database is a dataset of piano sound. \nIt consists of 8 kinds of pianos including PearlRiver, YoungChang, Steinway-T, Hsinghai, \nKawai, Steinway, Kawai-G, Yamaha(recorded by Shaohua Ji with SONY PCM-D100). \nData was annotated by students from the China Conservatory of Music (CCMUSIC) in Beijing\nand collected by George Chou.", "evaluation_metadata": {}}, "ccmusic-database/chest_falsetto": {"name": "ccmusic-database/chest_falsetto", "description": "This database contains 1280 monophonic singing audio (.wav format) of chest and falsetto voices, \nwith chest voice tagged as _chest and falsetto voice tagged as _falsetto. In addition, \nthe Mel-spectrogram, MFCC, and spectral characteristics of each audio segment are also included, \nfor a total of 5120 CSV files.", "evaluation_metadata": {}}, "ccmusic-database/music_genre": {"name": "ccmusic-database/music_genre", "description": "This database contains about 1700 musical pieces (.mp3 format) \nwith lengths of 270-300s that are divided into 17 genres in total.", "evaluation_metadata": {}}, "chenxwh/gen-xcopa": {"name": "chenxwh/gen-xcopa", "description": "A multilingual collection of XCOPA in ten languages generated by GPT-4", "evaluation_metadata": {}}, "ccmusic-database/bel_folk": {"name": "ccmusic-database/bel_folk", "description": "This database contains hundreds of acapella singing clips that are sung in two styles, \nBel Conto and Chinese national singing style by professional vocalists. \nAll of them are sung by professional vocalists and were recorded in professional commercial recording studios.", "evaluation_metadata": {}}, "chenxwh/gen-winograd": {"name": "chenxwh/gen-winograd", "description": "English Winograd generated by GPT-4", "evaluation_metadata": {}}, "chenxwh/gen-storycloze": {"name": "chenxwh/gen-storycloze", "description": "English Winograd generated by GPT-4", "evaluation_metadata": {}}, "Blablablab/SOCKET": {"name": "Blablablab/SOCKET", "description": "A unified evaluation benchmark dataset for evaludating socialbility of NLP models.", "evaluation_metadata": {}}, "ccmusic-database/CMITE": {"name": "ccmusic-database/CMITE", "description": "This database contains subjective timbre evaluation scores of 16 subjective timbre evaluation terms \n(such as bright, dark, raspy) on 37 Chinese national terms given by 14 participants in a subjective evaluation experiment. \nFurthermore, 10 reports on spectrum analysis of 10 instruments are also included.", "evaluation_metadata": {}}, "tatsu-lab/alpaca_eval": {"name": "tatsu-lab/alpaca_eval", "description": "Data for alpaca_eval, which aims to help automatic evaluation of instruction-following models", "evaluation_metadata": {}}, "albertvillanova/medmnist-v2": {"name": "albertvillanova/medmnist-v2", "description": "MedMNIST v2 is a large-scale MNIST-like collection of standardized biomedical images, including 12 datasets for 2D and 6 datasets for 3D.", "evaluation_metadata": {}}, "MaCoCu/parallel_data": {"name": "MaCoCu/parallel_data", "description": "The MaCoCu parallel dataset is an English-centric collection of 11\nparallel corpora including the following languages: Albanian,\nBulgarian, Bosnian, Croatian, Icelandic, Macedonian, Maltese,\nMontenegrin, Serbian, Slovenian, and Turkish. These corpora have\nbeen automatically crawled from national and generic top-level\ndomains (for example, \".hr\" for croatian, or \".is\" for icelandic);\nthen, a parallel curation pipeline has been applied to produce\nthe final data (see https://github.com/bitextor/bitextor).", "evaluation_metadata": {}}, "erjoy/sentBert-v2": {"name": "erjoy/sentBert-v2", "description": "The SNLI corpus (version 1.0) is a collection of 570k human-written English\nsentence pairs manually labeled for balanced classification with the labels\nentailment, contradiction, and neutral, supporting the task of natural language\ninference (NLI), also known as recognizing textual entailment (RTE).", "evaluation_metadata": {}}, "rvashurin/wikidata_simplequestions": {"name": "rvashurin/wikidata_simplequestions", "description": "HuggingFace wrapper for https://github.com/askplatypus/wikidata-simplequestions dataset\nSimplequestions dataset based on Wikidata.", "evaluation_metadata": {}}, "NavidVafaei/rottentomato01": {"name": "NavidVafaei/rottentomato01", "description": "rottento Corpus contains annotated\nsummaries.", "evaluation_metadata": [{"config": "rottento", "task": "summarization", "task_id": "summarization", "splits": {"eval_split": "test"}, "col_mapping": {"dialogue": "text", "summary": "target"}}]}, "vmalperovich/20ng_not_enough_data": {"name": "vmalperovich/20ng_not_enough_data", "description": "This data collection contains all the data used in our learning question classification experiments(see [1]), which has question class definitions, the training and testing question sets, examples of preprocessing the questions, feature definition scripts and examples of semantically related word features. \nThis work has been done by Xin Li and Dan Roth and supported by [2].", "evaluation_metadata": {}}, "TrainingDataPro/low_quality_webcam_video_attacks": {"name": "TrainingDataPro/low_quality_webcam_video_attacks", "description": "The dataset includes live-recorded Anti-Spoofing videos from around the world,\ncaptured via low-quality webcams with resolutions like QVGA, QQVGA and QCIF.", "evaluation_metadata": {}}, "TrainingDataPro/high_quality_webcam_video_attacks": {"name": "TrainingDataPro/high_quality_webcam_video_attacks", "description": "The dataset includes live-recorded Anti-Spoofing videos from around the world,\ncaptured via **high-quality** webcams with Full HD resolution and above.", "evaluation_metadata": {}}, "almanach/hc3_french_ood": {"name": "almanach/hc3_french_ood", "description": "Human ChatGPT Comparison Corpus (HC3) Translated To French.\nThe translation is done by Google Translate API.\nWe also add the native french QA pairs from ChatGPT, BingGPT and FAQ pages.\n\nThis dataset was used in our TALN 2023 paper.\nTowards a Robust Detection of Language Model-Generated Text: Is ChatGPT that easy to detect?", "evaluation_metadata": {}}, "kraina/airbnb": {"name": "kraina/airbnb", "description": "This dataset contains accommodation offers from the AirBnb platform from 10 European cities.\nIt has been copied from https://zenodo.org/record/4446043#.ZEV8d-zMI-R to make it available as a Huggingface Dataset.\nIt was originally published as supplementary material for the article: Determinants of Airbnb prices in European cities: A spatial econometrics approach\n(DOI: https://doi.org/10.1016/j.tourman.2021.104319)", "evaluation_metadata": {}}, "Marbyun/internal-datasets": {"name": "Marbyun/internal-datasets", "description": "SynQA is a Reading Comprehension dataset created in the work \"Improving Question Answering Model Robustness with Synthetic Adversarial Data Generation\" (https://aclanthology.org/2021.emnlp-main.696/).\nIt consists of 314,811 synthetically generated questions on the passages in the SQuAD v1.1 (https://arxiv.org/abs/1606.05250) training set.\n\nIn this work, we use a synthetic adversarial data generation to make QA models more robust to human adversaries. We develop a data generation pipeline that selects source passages, identifies candidate answers, generates questions, then finally filters or re-labels them to improve quality. Using this approach, we amplify a smaller human-written adversarial dataset to a much larger set of synthetic question-answer pairs. By incorporating our synthetic data, we improve the state-of-the-art on the AdversarialQA (https://adversarialqa.github.io/) dataset by 3.7F1 and improve model generalisation on nine of the twelve MRQA datasets. We further conduct a novel human-in-the-loop evaluation to show that our models are considerably more robust to new human-written adversarial examples: crowdworkers can fool our model only 8.8% of the time on average, compared to 17.6% for a model trained without synthetic data.\n\nFor full details on how the dataset was created, kindly refer to the paper.", "evaluation_metadata": {}}, "asapp/slue-phase-2": {"name": "asapp/slue-phase-2", "description": "Spoken Language Understanding Evaluation (SLUE) benchmark Phase 2.", "evaluation_metadata": {}}, "rcds/swiss_leading_decision_summarization": {"name": "rcds/swiss_leading_decision_summarization", "description": "This dataset contains court decisions for the swiss ruling summarization task.", "evaluation_metadata": {}}, "TrainingDataPro/2d-printed_masks_attacks": {"name": "TrainingDataPro/2d-printed_masks_attacks", "description": "The dataset consists of 40,000 videos and selfies with unique people. 15,000\nattack replays from 4,000 unique devices. 10,000 attacks with A4 printouts and\n10,000 attacks with cut-out printouts.", "evaluation_metadata": {}}, "hhu-dsml/emowoz": {"name": "hhu-dsml/emowoz", "description": "EmoWOZ is a user emotion recognition in task-oriented dialogues dataset, consisting all dialogues from MultiWOZ and 1000 additional human-machine dialogues (DialMAGE). Each user utterance is annotated with one of the following emotions: 0: neutral, 1: fearful, 2: dissatisfied, 3: apologetic, 4: abusive, 5: excited, 6: satisfied. System utterances are annotated with -1. For detailed label design and explanation, please refer to the paper and dataset homepage.", "evaluation_metadata": {}}, "shivangibithel/SOTAB": {"name": "shivangibithel/SOTAB", "description": "# Understanding the semantics of table elements is a prerequisite for many data integration and data discovery tasks. Table annotation is the task of labeling table elements with terms from a given vocabulary. This paper presents the WDC Schema.org Table Annotation Benchmark (SOTAB) for comparing the performance of table annotation systems. SOTAB covers the column type annotation (CTA) and columns property annotation (CPA) tasks. SOTAB provides \u223c50,000 annotated tables for each of the tasks containing Schema.org data from different websites. The tables cover 17 different types of entities such as movie, event, local business, recipe, job posting, or product. The tables stem from the WDC Schema.org Table Corpus which was created by extracting Schema.org annotations from the Common Crawl. Consequently, the labels used for annotating columns in SOTAB are part of the Schema.org vocabulary. The benchmark covers 91 types for CTA and 176 properties for CPA distributed across textual, numerical and date/time columns. The tables are split into fixed training, validation and test sets. The test sets are further divided into subsets focusing on specific challenges, such as columns with missing values or different value formats, in order to allow a more fine-grained comparison of annotation systems. The evaluation of SOTAB using Doduo and TURL shows that the benchmark is difficult to solve for current state-of-the-art systems.\n#", "evaluation_metadata": {}}, "rcds/swiss_citation_extraction": {"name": "rcds/swiss_citation_extraction", "description": "This dataset contains court decision for cit ex task.", "evaluation_metadata": {}}, "tomaarsen/conllpp": {"name": "tomaarsen/conllpp", "description": "CoNLLpp is a corrected version of the CoNLL2003 NER dataset where labels of 5.38% of the sentences in the test set\nhave been manually corrected. The training set and development set are included for completeness.\nFor more details see https://www.aclweb.org/anthology/D19-1519/ and https://github.com/ZihanWangKi/CrossWeigh", "evaluation_metadata": [{"config": "conllpp", "task": "token-classification", "task_id": "entity_extraction", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"tokens": "tokens", "ner_tags": "tags"}, "metrics": [{"type": "seqeval", "name": "seqeval"}]}]}, "punglee/librispeech_asr": {"name": "punglee/librispeech_asr", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "cjvt/janes_preklop": {"name": "cjvt/janes_preklop", "description": "Janes-Preklop is a corpus of Slovene tweets that is manually annotated for code-switching (the use of words from two \nor more languages within one sentence or utterance), according to the supplied typology.", "evaluation_metadata": {}}, "luist18/ptparl": {"name": "luist18/ptparl", "description": "The PTPARL dataset is a dataset containing 5713 interventions in the Portuguese parliament.", "evaluation_metadata": {}}, "bjoernp/mmlu_de": {"name": "bjoernp/mmlu_de", "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.", "evaluation_metadata": {}}, "kaist-ai/CoT-Collection_multilingual": {"name": "kaist-ai/CoT-Collection_multilingual", "description": "\"\"\"\n\n_LICENSE = \"CC BY 4.0\"\n\n_HOMEPAGE = \"https://github.com/kaistAI/CoT-Collection\"\n\n\n\n_LANGUAGES = {\n \"ko\": \"Korean\",\n \"fr\": \"French\",\n \"ru\": \"Russian\",\n \"ja\": \"Japanese\",\n \"zh\": \"Chinese\",\n}\n# _ALL_LANGUAGES = \"all_languages\"\n\n\n\nclass CoTCollectionMultiConfig(datasets.BuilderConfig):", "evaluation_metadata": {}}, "kaist-ai/Flan-Collection_subset_multilingual": {"name": "kaist-ai/Flan-Collection_subset_multilingual", "description": "\"\"\"\n\n_LICENSE = \"CC BY 4.0\"\n\n_HOMEPAGE = \"https://github.com/kaistAI/CoT-Collection\"\n\n\n\n_LANGUAGES = {\n \"ko\": \"Korean\",\n \"fr\": \"French\",\n \"ru\": \"Russian\",\n \"ja\": \"Japanese\",\n \"zh\": \"Chinese\",\n}\n# _ALL_LANGUAGES = \"all_languages\"\n\n\n\nclass FlanCollectionMultiConfig(datasets.BuilderConfig):", "evaluation_metadata": {}}, "kaist-ai/CoT-Collection": {"name": "kaist-ai/CoT-Collection", "description": "\"\"\"\n\n_LICENSE = \"CC BY 4.0\"\n\n_HOMEPAGE = \"https://github.com/kaistAI/CoT-Collection\"\n\n_LANGUAGES = {\n \"en\": \"English\",\n}\n# _ALL_LANGUAGES = \"all_languages\"\n\n\n\nclass CoTCollectionMultiConfig(datasets.BuilderConfig):", "evaluation_metadata": {}}, "cjvt/janes_tag": {"name": "cjvt/janes_tag", "description": "Janes-Tag is a manually annotated corpus of Slovene Computer-Mediated Communication (CMC) consisting of mostly tweets \nbut also blogs, forums and news comments.", "evaluation_metadata": {}}, "TrainingDataPro/plantations_segmentation": {"name": "TrainingDataPro/plantations_segmentation", "description": "The images consist of aerial photography of agricultural plantations with crops\nsuch as cabbage and zucchini. The dataset addresses agricultural tasks such as\nplant detection and counting, health assessment, and irrigation planning.\nThe dataset consists of plantations' photographs with object and class\nsegmentation of cabbage.", "evaluation_metadata": {}}, "tianyang/repobench-r": {"name": "tianyang/repobench-r", "description": "RepoBench is a dataset that benchmarks repository-level code auto-completion systems.\n\nRepoBench-R denotes RepoBench for Retrieval, which is a sub-task of RepoBench, \naiming to evaluate the ability of code auto-completion systems to retrieve \nrelevant code snippets for next-line code completion.", "evaluation_metadata": {}}, "TrainingDataPro/outdoor_garbage": {"name": "TrainingDataPro/outdoor_garbage", "description": "The dataset consisting of garbage cans of various capacities and types.\nBest to train a neural network to monitor the timely removal of garbage and\norganize the logistics of vehicles for garbage collection. Dataset is useful\nfor the recommendation systems, optimization and automization the work of \ncommunity services, smart city.", "evaluation_metadata": {}}, "TrainingDataPro/bald_classification": {"name": "TrainingDataPro/bald_classification", "description": "Dataset consists of 5000 photos of people with 7 stages of hairloss according\nto the Norwood scale. Dataset is useful for training neural networks for the\nrecommendation systems, optimizing the work processes of trichologists and\napplications in the Med / Beauty spheres.", "evaluation_metadata": {}}, "Idrizorg/WER_Evaluation_For_TTS": {"name": "Idrizorg/WER_Evaluation_For_TTS", "description": "The SOMOS dataset contains 20,000 synthetic utterances (wavs), 100 natural utterances and 374,955 naturalness evaluations (human-assigned scores in the range 1-5). The synthetic utterances are single-speaker, generated by training several Tacotron-like acoustic models and an LPCNet vocoder on the LJ Speech voice public dataset. 2,000 text sentences were synthesized, selected from Blizzard Challenge texts of years 2007-2016, the LJ Speech corpus as well as Wikipedia and general domain data from the Internet.\nNaturalness evaluations were collected via crowdsourcing a listening test on Amazon Mechanical Turk in the US, GB and CA locales. The records of listening test participants (workers) are fully anonymized. Statistics on the reliability of the scores assigned by the workers are also included, generated through processing the scores and validation controls per submission page.", "evaluation_metadata": {}}, "GAIR/lima": {"name": "GAIR/lima", "description": "A high-quality dataset for efficient instruction tuning.", "evaluation_metadata": {}}, "explodinggradients/fiqa": {"name": "explodinggradients/fiqa", "description": "FiQA dataset formated in a way that is easier for doing RAG experiments", "evaluation_metadata": {}}, "BAAI/COIG-PC": {"name": "BAAI/COIG-PC", "description": "The COIG-PC Dataset is a meticulously curated and comprehensive collection of Chinese tasks and data, designed to facilitate the fine-tuning and optimization of language models for Chinese natural language processing (NLP). The dataset aims to provide researchers and developers with a rich set of resources to improve the capabilities of language models in handling Chinese text, which can be utilized in various fields such as text generation, information extraction, sentiment analysis, machine translation, among others.", "evaluation_metadata": {}}, "cvcio/toxic-el": {"name": "cvcio/toxic-el", "description": "Greek Toxic Tweets Dataset from the Civic Information Office.", "evaluation_metadata": {}}, "polinaeterna/amazon_us_reviews": {"name": "polinaeterna/amazon_us_reviews", "description": "Amazon Customer Reviews (a.k.a. Product Reviews) is one of Amazons iconic products. In a period of over two decades since the first review in 1995, millions of Amazon customers have contributed over a hundred million reviews to express opinions and describe their experiences regarding products on the Amazon.com website. This makes Amazon Customer Reviews a rich source of information for academic researchers in the fields of Natural Language Processing (NLP), Information Retrieval (IR), and Machine Learning (ML), amongst others. Accordingly, we are releasing this data to further research in multiple disciplines related to understanding customer product experiences. Specifically, this dataset was constructed to represent a sample of customer evaluations and opinions, variation in the perception of a product across geographical regions, and promotional intent or bias in reviews.\n\nOver 130+ million customer reviews are available to researchers as part of this release. The data is available in TSV files in the amazon-reviews-pds S3 bucket in AWS US East Region. Each line in the data files corresponds to an individual review (tab delimited, with no quote and escape characters).\n\nEach Dataset contains the following columns:\n\n- marketplace: 2 letter country code of the marketplace where the review was written.\n- customer_id: Random identifier that can be used to aggregate reviews written by a single author.\n- review_id: The unique ID of the review.\n- product_id: The unique Product ID the review pertains to. In the multilingual dataset the reviews for the same product in different countries can be grouped by the same product_id.\n- product_parent: Random identifier that can be used to aggregate reviews for the same product.\n- product_title: Title of the product.\n- product_category: Broad product category that can be used to group reviews (also used to group the dataset into coherent parts).\n- star_rating: The 1-5 star rating of the review.\n- helpful_votes: Number of helpful votes.\n- total_votes: Number of total votes the review received.\n- vine: Review was written as part of the Vine program.\n- verified_purchase: The review is on a verified purchase.\n- review_headline: The title of the review.\n- review_body: The review text.\n- review_date: The date the review was written.", "evaluation_metadata": {}}, "dd123/test_data_huggingface": {"name": "dd123/test_data_huggingface", "description": "\"\"\"\n\n_HOMEPAGE = \"https://gitee.com/didi233/test_date_gitee\"\n\n_LICENSE = \"Creative Commons Attribution 4.0 International\"\n\n# _TRAIN_DOWNLOAD_URL = \"https://raw.githubusercontent.com/freeziyou/live_stream_dataset/main/train.csv\"\n_TRAIN_DOWNLOAD_URL = \"https://gitee.com/didi233/test_date_gitee/raw/master/train.csv\"\n# _TEST_DOWNLOAD_URL = \"https://raw.githubusercontent.com/freeziyou/live_stream_dataset/main/test.csv\"\n_TEST_DOWNLOAD_URL = \"https://gitee.com/didi233/test_date_gitee/raw/master/test.csv\"\n\n\nclass test_data_huggingface(datasets.GeneratorBasedBuilder):", "evaluation_metadata": {}}, "pain/MASC": {"name": "pain/MASC", "description": "MASC is a dataset that contains 1,000 hours of speech sampled at 16 kHz and crawled from over 700 YouTube channels. The dataset is multi-regional, multi-genre, and multi-dialect intended to advance the research and development of Arabic speech technology with a special emphasis on Arabic speech recognition.", "evaluation_metadata": {}}, "Rathanr/wikisql": {"name": "Rathanr/wikisql", "description": "A large crowd-sourced dataset for developing natural language interfaces for relational databases", "evaluation_metadata": {}}, "drt/gqa": {"name": "drt/gqa", "description": "GQA is a dataset containing 58K questions about subgraphs extracted from Wikidata.\nThe data are made from Lc-QuAD 2.0 and MCWQ datasets.", "evaluation_metadata": {}}, "graelo/wikipedia": {"name": "graelo/wikipedia", "description": "Wikipedia dataset containing cleaned articles of all languages.\nThe datasets are built from the Wikipedia dump\n(https://dumps.wikimedia.org/) with one split per language. Each example\ncontains the content of one full Wikipedia article with cleaning to strip\nmarkdown and unwanted sections (references, etc.).", "evaluation_metadata": {}}, "Yulong-W/squadorirobustness": {"name": "Yulong-W/squadorirobustness", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "Yulong-W/squadpararobustness": {"name": "Yulong-W/squadpararobustness", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "lang-uk/malyuk": {"name": "lang-uk/malyuk", "description": "Malyuk \u2014 a large, compiled corpus of ukrainian language texts.\n113GB of text data in jsonl format.\nCombination of UberText 2.0, Ukrainian part of the Oscar, and Ukrainian News.\n\nNothing is guaranteed. Use at your own risk.", "evaluation_metadata": {}}, "notrichardren/easy_qa": {"name": "notrichardren/easy_qa", "description": "EasyQA is a GPT-3.5-turbo-generated dataset of easy kindergarten-level facts, meant to be used to prompt and evaluate large language models for \"common sense\" truthful responses. It was originally created to understand how different types of truthfulness may be represented in the intermediate activations of large language models. EasyQA compromises 2346 questions that span 50 categories, including art, technology, education, music, and animals. Questions are crafted to be extremely simple and obvious, eliciting an obvious truth that would not be susceptible to misconceptions.", "evaluation_metadata": {}}, "ppeyret/nbm-demo": {"name": "ppeyret/nbm-demo", "description": "Demo dataset for testing or showing image-text capabilities.", "evaluation_metadata": {}}, "TrainingDataPro/helmet_detection": {"name": "TrainingDataPro/helmet_detection", "description": "An example of a dataset that we've collected for a photo edit App.\nThe dataset includes 20 selfies of people (man and women)\nin segmentation masks and their visualisations.", "evaluation_metadata": {}}, "CIRAL/ciral": {"name": "CIRAL/ciral", "description": "This dataset consists of the queries and relevance judgements in the CIRAL test collection.", "evaluation_metadata": {}}, "BerMaker/test": {"name": "BerMaker/test", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "TrainingDataPro/grocery-shelves-dataset": {"name": "TrainingDataPro/grocery-shelves-dataset", "description": "The dataset consist of labeled photographs of grocery store shelves.\nThe Product Facing Dataset can be used to analyze and optimize product\nplacement data, develop strategies for increasing product visibility,\nmaximize the effectiveness of the product placements and increase sales.", "evaluation_metadata": {}}, "taeshahn/ko-lima": {"name": "taeshahn/ko-lima", "description": "A high-quality korean dataset for efficient instruction tuning.", "evaluation_metadata": {}}, "dd123/live_stream_dataset_huggingface": {"name": "dd123/live_stream_dataset_huggingface", "description": "\"\"\"\n\n_HOMEPAGE = \"https://github.com/freeziyou/live_stream_dataset\"\n\n_LICENSE = \"Creative Commons Attribution 4.0 International\"\n\n_TRAIN_DOWNLOAD_URL = \"https://raw.githubusercontent.com/freeziyou/live_stream_dataset/main/train.csv\"\n# _TRAIN_DOWNLOAD_URL = \"https://gitee.com/didi233/test_date_gitee/raw/master/train.csv\"\n_TEST_DOWNLOAD_URL = \"https://raw.githubusercontent.com/freeziyou/live_stream_dataset/main/test.csv\"\n\n\n# _TEST_DOWNLOAD_URL = \"https://gitee.com/didi233/test_date_gitee/raw/master/test.csv\"\n\n\nclass live_stream_dataset_huggingface(datasets.GeneratorBasedBuilder):", "evaluation_metadata": {}}, "Babelscape/REDFM": {"name": "Babelscape/REDFM", "description": "Relation Extraction (RE) is a task that identifies relationships between entities in a text, enabling the acquisition of relational facts and bridging the gap between natural language and structured knowledge. However, current RE models often rely on small datasets with low coverage of relation types, particularly when working with languages other than English. \\In this paper, we address the above issue and provide two new resources that enable the training and evaluation of multilingual RE systems.\nFirst, we present SRED\\textsuperscript{FM}, an automatically annotated dataset covering 18 languages, 400 relation types, 13 entity types, totaling more than 40 million triplet instances. Second, we propose RED\\textsuperscript{FM}, a smaller, human-revised dataset for seven languages that allows for the evaluation of multilingual RE systems. \nTo demonstrate the utility of these novel datasets, we experiment with the first end-to-end multilingual RE model, mREBEL, \nthat extracts triplets, including entity types, in multiple languages. We release our resources and model checkpoints at \\href{https://www.github.com/babelscape/rebel}{https://www.github.com/babelscape/rebel}.", "evaluation_metadata": {}}, "Babelscape/SREDFM": {"name": "Babelscape/SREDFM", "description": "Relation Extraction (RE) is a task that identifies relationships between entities in a text, enabling the acquisition of relational facts and bridging the gap between natural language and structured knowledge. However, current RE models often rely on small datasets with low coverage of relation types, particularly when working with languages other than English. \\In this paper, we address the above issue and provide two new resources that enable the training and evaluation of multilingual RE systems.\nFirst, we present SRED\\textsuperscript{FM}, an automatically annotated dataset covering 18 languages, 400 relation types, 13 entity types, totaling more than 40 million triplet instances. Second, we propose RED\\textsuperscript{FM}, a smaller, human-revised dataset for seven languages that allows for the evaluation of multilingual RE systems. \nTo demonstrate the utility of these novel datasets, we experiment with the first end-to-end multilingual RE model, mREBEL, \nthat extracts triplets, including entity types, in multiple languages. We release our resources and model checkpoints at \\href{https://www.github.com/babelscape/rebel}{https://www.github.com/babelscape/rebel}.", "evaluation_metadata": {}}, "RiTA-nlp/ITALIC": {"name": "RiTA-nlp/ITALIC", "description": "ITALIC is a dataset of Italian audio recordings and contains annotation for utterance transcripts and associated intents. \nThe ITALIC dataset was created through a custom web platform, utilizing both native and non-native Italian speakers as participants. \nThe participants were required to record themselves while reading a randomly sampled short text from the MASSIVE dataset.", "evaluation_metadata": {}}, "shibing624/snli-zh": {"name": "shibing624/snli-zh", "description": "The SNLI corpus (version 1.0) is a collection of 570k human-written English\nsentence pairs manually labeled for balanced classification with the labels\nentailment, contradiction, and neutral, supporting the task of natural language\ninference (NLI), also known as recognizing textual entailment (RTE).", "evaluation_metadata": {}}, "shibing624/nli-zh-all": {"name": "shibing624/nli-zh-all", "description": "The SNLI corpus (version 1.0) is a merged chinese sentence similarity dataset, supporting the task of natural language\ninference (NLI), also known as recognizing textual entailment (RTE).", "evaluation_metadata": {}}, "TrainingDataPro/basketball_tracking": {"name": "TrainingDataPro/basketball_tracking", "description": "The dataset consist of screenshots from videos of basketball games with\nthe ball labeled with a bounging box. \nThe dataset can be used to train a neural network in ball control recognition.\nThe dataset is useful for automating the camera operator's work during a match,\nallowing the ball to be efficiently kept in frame.", "evaluation_metadata": {}}, "L4NLP/LEval": {"name": "L4NLP/LEval", "description": "A benchmark to evaluate long document understanding and generation ability of LLM", "evaluation_metadata": {}}, "asoria/nell": {"name": "asoria/nell", "description": "This dataset provides version 1115 of the belief\nextracted by CMU's Never Ending Language Learner (NELL) and version\n1110 of the candidate belief extracted by NELL. See\nhttp://rtw.ml.cmu.edu/rtw/overview. NELL is an open information\nextraction system that attempts to read the Clueweb09 of 500 million\nweb pages (http://boston.lti.cs.cmu.edu/Data/clueweb09/) and general\nweb searches.\n\nThe dataset has 4 configurations: nell_belief, nell_candidate,\nnell_belief_sentences, and nell_candidate_sentences. nell_belief is\ncertainties of belief are lower. The two sentences config extracts the\nCPL sentence patterns filled with the applicable 'best' literal string\nfor the entities filled into the sentence patterns. And also provides\nsentences found using web searches containing the entities and\nrelationships.\n\nThere are roughly 21M entries for nell_belief_sentences, and 100M\nsentences for nell_candidate_sentences.", "evaluation_metadata": {}}, "Riksarkivet/test_images_demo": {"name": "Riksarkivet/test_images_demo", "description": "Demo dataset for the htr demo.", "evaluation_metadata": {}}, "marcusy/nlp_ah_dataset": {"name": "marcusy/nlp_ah_dataset", "description": "This new dataset is designed to solve the Ahrefs' NLP task and is crafted by Eu Jin\nMarcus Yatim", "evaluation_metadata": {}}, "RepoFusion/Stack-Repo": {"name": "RepoFusion/Stack-Repo", "description": "This is the Stack-Repo dataset", "evaluation_metadata": {}}, "mrjunos/depression-reddit-cleaned": {"name": "mrjunos/depression-reddit-cleaned", "description": "The dataset provided is a Depression: Reddit Dataset (Cleaned)containing approximately\n7,000 labeled instances. It consists of two main features: 'text' and 'label'.\nThe 'text' feature contains the text data from Reddit posts related to depression, while\nthe 'label' feature indicates whether a post is classified as depression or not.\n\nThe raw data for this dataset was collected by web scraping Subreddits. To ensure the data's\nquality and usefulness, multiple natural language processing (NLP) techniques were applied\nto clean the data. The dataset exclusively consists of English-language posts, and its\nprimary purpose is to facilitate mental health classification tasks.\n\nThis dataset can be employed in various natural language processing tasks related to\ndepression,such as sentiment analysis, topic modeling, text classification, or any other NLP\ntask that requires labeled data pertaining to depression from Reddit.", "evaluation_metadata": {}}, "TJUNLP/M3KE": {"name": "TJUNLP/M3KE", "description": "A Massive Multi-Level Multi-Subject Knowledge Evaluation Benchmark for Chinese Large Language Models.", "evaluation_metadata": {}}, "tianyang/repobench-c": {"name": "tianyang/repobench-c", "description": "RepoBench is a dataset that benchmarks repository-level code auto-completion systems.\n\nRepoBench-C denotes RepoBench for code completion, \nwhich is subtask of RepoBench for next-line code prediction given both cross-file and in-file context.", "evaluation_metadata": {}}, "HausaNLP/HausaVG": {"name": "HausaNLP/HausaVG", "description": "Multi-modal Machine Translation (MMT) enables the use of visual information to enhance the quality of translations, especially where the full context is not available to enable the unambiguous translation in standard machine translation. Despite the increasing popularity of such technique, it lacks sufficient and qualitative datasets to maximize the full extent of its potential. Hausa, a Chadic language, is a member of the Afro-Asiatic language family. It is estimated that about 100 to 150 million people speak the language, with more than 80 million indigenous speakers. This is more than any of the other Chadic languages. Despite the large number of speakers, the Hausa language is considered as a low resource language in natural language processing (NLP). This is due to the absence of enough resources to implement most of the tasks in NLP. While some datasets exist, they are either scarce, machine-generated or in the religious domain. Therefore, there is the need to create training and evaluation data for implementing machine learning tasks and bridging the research gap in the language. This work presents the Hausa Visual Genome (HaVG), a dataset that contains the description of an image or a section within the image in Hausa and its equivalent in English. The dataset was prepared by automatically translating the English description of the images in the Hindi Visual Genome (HVG). The synthetic Hausa data was then carefully postedited, taking into cognizance the respective images. The data is made of 32,923 images and their descriptions that are divided into training, development, test, and challenge test set. The Hausa Visual Genome is the first dataset of its kind and can be used for Hausa-English machine translation, multi-modal research, image description, among various other natural language processing and generation tasks.", "evaluation_metadata": {}}, "HausaNLP/AfriSenti-Twitter": {"name": "HausaNLP/AfriSenti-Twitter", "description": "AfriSenti is the largest sentiment analysis benchmark dataset for under-represented African languages---covering 110,000+ annotated tweets in 14 African languages (Amharic, Algerian Arabic, Hausa, Igbo, Kinyarwanda, Moroccan Arabic, Mozambican Portuguese, Nigerian Pidgin, Oromo, Swahili, Tigrinya, Twi, Xitsonga, and yoruba).", "evaluation_metadata": {}}, "HausaNLP/NaijaSenti-Twitter": {"name": "HausaNLP/NaijaSenti-Twitter", "description": "NaijaSenti is the first large-scale human-annotated Twitter sentiment dataset for the four most widely spoken languages in Nigeria \u2014 Hausa, Igbo, Nigerian-Pidgin, and Yor\u00f9b\u00e1 \u2014 consisting of around 30,000 annotated tweets per language, including a significant fraction of code-mixed tweets.", "evaluation_metadata": {}}, "HausaNLP/Naija-Lex": {"name": "HausaNLP/Naija-Lex", "description": "Naija-Stopwords is a part of the Naija-Senti project. It is a list of collected stopwords from the four most widely spoken languages in Nigeria \u2014 Hausa, Igbo, Nigerian-Pidgin, and Yor\u00f9b\u00e1.", "evaluation_metadata": {}}, "tianyang/repobench-p": {"name": "tianyang/repobench-p", "description": "RepoBench is a dataset that benchmarks repository-level code auto-completion systems.\n\nRepoBench-P denotes RepoBench for pipeline, \nwhich is subtask of RepoBench including both relevant code retrieval and next-line code prediction.", "evaluation_metadata": {}}, "HausaNLP/Naija-Stopwords": {"name": "HausaNLP/Naija-Stopwords", "description": "Naija-Stopwords is a part of the Naija-Senti project. It is a list of collected stopwords from the four most widely spoken languages in Nigeria \u2014 Hausa, Igbo, Nigerian-Pidgin, and Yor\u00f9b\u00e1.", "evaluation_metadata": {}}, "PNLPhub/FarsTail": {"name": "PNLPhub/FarsTail", "description": "\\\\\\\\\\\\\\A Persian Natural Language Inference Dataset", "evaluation_metadata": {}}, "sippycoder/RedPajama-Data-1T-no-cc-c4": {"name": "sippycoder/RedPajama-Data-1T-no-cc-c4", "description": "RedPajama is a clean-room, fully open-source implementation of the LLaMa dataset.", "evaluation_metadata": {}}, "Yahaira/beans": {"name": "Yahaira/beans", "description": "Beans is a dataset of images of beans taken in the field using smartphone\ncameras. It consists of 3 classes: 2 disease classes and the healthy class.\nDiseases depicted include Angular Leaf Spot and Bean Rust. Data was annotated\nby experts from the National Crops Resources Research Institute (NaCRRI) in\nUganda and collected by the Makerere AI research lab.", "evaluation_metadata": {}}, "severo/flores_101": {"name": "severo/flores_101", "description": "One of the biggest challenges hindering progress in low-resource and multilingual machine translation is the \nlack of good evaluation benchmarks. Current evaluation benchmarks either lack good coverage of low-resource \nlanguages, consider only restricted domains, or are low quality because they are constructed using \nsemi-automatic procedures. In this work, we introduce the FLORES evaluation benchmark, consisting of 3001 \nsentences extracted from English Wikipedia and covering a variety of different topics and domains. \nThese sentences have been translated in 101 languages by professional translators through a carefully \ncontrolled process. The resulting dataset enables better assessment of model quality on the long tail of \nlow-resource languages, including the evaluation of many-to-many multilingual translation systems, as all \ntranslations are multilingually aligned. By publicly releasing such a high-quality and high-coverage dataset, \nwe hope to foster progress in the machine translation community and beyond.", "evaluation_metadata": {}}, "kumapo/JAQKET": {"name": "kumapo/JAQKET", "description": "JAQKET: JApanese Questions on Knowledge of EnTitie", "evaluation_metadata": {}}, "nyanko7/coco-hosted": {"name": "nyanko7/coco-hosted", "description": "MS COCO is a large-scale object detection, segmentation, and captioning dataset.\nCOCO has several features: Object segmentation, Recognition in context, Superpixel stuff segmentation, 330K images (>200K labeled), 1.5 million object instances, 80 object categories, 91 stuff categories, 5 captions per image, 250,000 people with keypoints.", "evaluation_metadata": {}}, "sarus-tech/phee": {"name": "sarus-tech/phee", "description": "Data and Code for [``PHEE: A Dataset for Pharmacovigilance Event Extraction from Text``](https://arxiv.org/abs/2210.12560/)\\", "evaluation_metadata": {}}, "cryptom/ceval-exam": {"name": "cryptom/ceval-exam", "description": "C-Eval is a comprehensive Chinese evaluation suite for foundation models. It consists of 13948 multi-choice questions spanning 52 diverse disciplines and four difficulty levels.", "evaluation_metadata": {}}, "HANSEN-REPO/HANSEN": {"name": "HANSEN-REPO/HANSEN", "description": "This benchmark environment contains a dataset comprised of human-spoken text and Large Language Models (LLM) generated spoken text.\nWe also have three benchmark tasks - AA (multi-class classification problem on human datasets), AV (binary classification problem on whether two spoken texts are from same human),\nand TT (Turing test problem, determining human vs AI spoken texts problem).", "evaluation_metadata": {}}, "Splend1dchan/librispeech_asr_individual": {"name": "Splend1dchan/librispeech_asr_individual", "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87", "evaluation_metadata": {}}, "haih2/japanese-conala": {"name": "haih2/japanese-conala", "description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.", "evaluation_metadata": {}}, "hezarai/lscp-500k": {"name": "hezarai/lscp-500k", "description": "Language recognition has been significantly advanced in recent years by means of modern machine learning methods such as deep learning \nand benchmarks with rich annotations. However, research is still limited in low-resource formal languages. This consists of a significant \ngap in describing the colloquial language especially for low-resourced ones such as Persian. In order to target this gap for low resource languages, \nwe propose a \u201cLarge Scale Colloquial Persian Dataset\u201d (LSCP). LSCP is hierarchically organized in a semantic taxonomy that focuses on \nmulti-task informal Persian language understanding as a comprehensive problem. This encompasses the recognition of multiple semantic aspects in the human-level sentences, \nwhich naturally captures from the real-world sentences. We believe that further investigations and processing, as well as the application of novel algorithms and methods, \ncan strengthen enriching computerized understanding and processing of low resource languages. The proposed corpus consists of 120M sentences resulted from 27M tweets \nannotated with parsing tree, part-of-speech tags, sentiment polarity and translation in five different languages.", "evaluation_metadata": {}}, "gabeorlanski/bc-humaneval": {"name": "gabeorlanski/bc-humaneval", "description": "The HumanEval dataset in BabelCode format.", "evaluation_metadata": {}}, "haonan-li/cmmlu": {"name": "haonan-li/cmmlu", "description": "CMMLU is a comprehensive Chinese assessment suite specifically designed to evaluate the advanced knowledge and reasoning abilities of LLMs within the Chinese language and cultural context.", "evaluation_metadata": {}}, "gabeorlanski/bc-mbpp": {"name": "gabeorlanski/bc-mbpp", "description": "The MBPP dataset in BabelCode format.", "evaluation_metadata": {}}, "yulongmannlp/dev_orig": {"name": "yulongmannlp/dev_orig", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "yulongmannlp/dev_para": {"name": "yulongmannlp/dev_para", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "yulongmannlp/adv_ori": {"name": "yulongmannlp/adv_ori", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "yulongmannlp/adv_para": {"name": "yulongmannlp/adv_para", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "gradients-ai/mc4_v01": {"name": "gradients-ai/mc4_v01", "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by Gradients Technologies Company.", "evaluation_metadata": {}}, "TrainingDataPro/people-tracking-dataset": {"name": "TrainingDataPro/people-tracking-dataset", "description": "The dataset comprises of annotated video frames from positioned in a public\nspace camera. The tracking of each individual in the camera's view\nhas been achieved using the rectangle tool in the Computer Vision Annotation Tool (CVAT).", "evaluation_metadata": {}}, "TrainingDataPro/cars-video-object-tracking": {"name": "TrainingDataPro/cars-video-object-tracking", "description": "The collection of overhead video frames, capturing various types of vehicles\ntraversing a roadway. The dataset inculdes light vehicles (cars) and\nheavy vehicles (minivan).", "evaluation_metadata": {}}, "rafaelpadilla/interior-cgi": {"name": "rafaelpadilla/interior-cgi", "description": "This new dataset contains CG interior images representing interior of houses in 5 classes, with 1000 images per class.", "evaluation_metadata": {}}, "CAiRE/YueMotion": {"name": "CAiRE/YueMotion", "description": "YueMotion is a Cantonese speech emotion dataset.", "evaluation_metadata": {}}, "gabeorlanski/bc-transcoder": {"name": "gabeorlanski/bc-transcoder", "description": "The Transcoder dataset in BabelCode format. Currently supports translation from C++ and Python.", "evaluation_metadata": {}}, "Einstellung/wiki_art": {"name": "Einstellung/wiki_art", "description": "Este dataset fue creado para el workshop de Medellin AI y Bancolombia con fines educativos.", "evaluation_metadata": {}}, "BAAI/COIG-PC-Lite": {"name": "BAAI/COIG-PC-Lite", "description": "The COIG-PC Dataset is a meticulously curated and comprehensive collection of Chinese tasks and data, designed to facilitate the fine-tuning and optimization of language models for Chinese natural language processing (NLP). The dataset aims to provide researchers and developers with a rich set of resources to improve the capabilities of language models in handling Chinese text, which can be utilized in various fields such as text generation, information extraction, sentiment analysis, machine translation, among others.", "evaluation_metadata": {}}, "arielnlee/Superimposed-Masked-Dataset": {"name": "arielnlee/Superimposed-Masked-Dataset", "description": "SMD is an occluded ImageNet-1K validation set, created to be an additional way to evaluate the impact of occlusion on model performance. This experiment used a variety of occluder objects that are not in the ImageNet-1K label space and are unambiguous in relationship to objects that reside in the label space.", "evaluation_metadata": {}}, "PaDaS-Lab/SynStOp": {"name": "PaDaS-Lab/SynStOp", "description": "Minimal dataset for intended for LM development and testing using python string operations.\n The dataset is created by running different one line python string operations on random strings\n The idea is, that transformer implementation can learn the string operations and that this task is a good\n proxy tasks for other transformer operations on real languages and real tasks. Consequently, the\n data set is small and can be used in the development process without large scale infrastructures.\n \nThere are different configurations for the data set.\n\n- `small`: contains below 50k instances of various string length and only contains slicing operations, i.e. all python operations expressable with `s[i:j:s]` (which also includes string reversal).\n - you can further choose different subsets according to either length or the kind of operation\n- `small10`: like small, but only strings to length 10\n- `small15`: like small, but only strings to length 15\n- `small20`: like small, but only strings to length 20\n\nThe fields have the following meaning:\n\n - `input`: input string, i.e. the string and the string operation\n - `output`: output of the string operation\n - `code`: code for running the string operation in python,\n - `res_var`: name of the result variable\n - `operation`: kind of operation: \n - `step_x` for `s[::x]`\n - `char_at_x` for `s[x]`\n - `slice_x:y` for `s[x:y]`\n - `slice_step_x:y:z` for `s[x:y:z]`\n - `slice_reverse_i:j:k` for `s[i:i+j][::k]`\n\n Siblings of `data` contain additional metadata information about the dataset.\n\n - `prompt` describes possible prompts based on that data splitted into input prompts / output prompts", "evaluation_metadata": {}}, "asoria/copy_e_glue": {"name": "asoria/copy_e_glue", "description": "We use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to Code in Programmatic Context. See paper for details.", "evaluation_metadata": {}}, "asoria/copy_d_glue": {"name": "asoria/copy_d_glue", "description": "We use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to Code in Programmatic Context. See paper for details.", "evaluation_metadata": {}}, "arielnlee/Realistic-Occlusion-Dataset": {"name": "arielnlee/Realistic-Occlusion-Dataset", "description": "ROD is meant to serve as a metric for evaluating models' robustness to occlusion. It is the product of a meticulous object collection protocol aimed at collecting and capturing 40+ distinct, real-world objects from 16 classes.", "evaluation_metadata": {}}, "VictorSanh/LrvInstruction": {"name": "VictorSanh/LrvInstruction", "description": "LRV-Instruction is a dataset consisting of 120k visual instructions generated by GPT4, covering 16 vision-and-language tasks with open-ended instructions and answers. LRV-Instruction include both positive and negative instructions for more robust visual instruction tuning. The images of our dataset are from Visual Genome.", "evaluation_metadata": {}}, "commaai/comma2k19": {"name": "commaai/comma2k19", "description": "comma2k19 is a dataset of over 33 hours of commute in California's 280 highway.\nThis means 2019 segments, 1 minute long each, on a 20km section of highway driving between California's San Jose and San Francisco.\ncomma2k19 is a fully reproducible and scalable dataset.\nThe data was collected using comma EONs that has sensors similar to those of any modern smartphone including a road-facing camera, phone GPS, thermometers and 9-axis IMU.\nAdditionally, the EON captures raw GNSS measurements and all CAN data sent by the car with a comma grey panda.", "evaluation_metadata": {}}, "TrainingDataPro/MacBook-Attacks-Dataset": {"name": "TrainingDataPro/MacBook-Attacks-Dataset", "description": "The dataset consists of videos of replay attacks played on different\nmodels of MacBooks. The dataset solves tasks in the field of anti-spoofing and\nit is useful for buisness and safety systems.\n\nThe dataset includes: **replay attacks** - videos of real people played on\na computer and filmed on the phone.", "evaluation_metadata": {}}, "TrainingDataPro/monitors-replay-attacks-dataset": {"name": "TrainingDataPro/monitors-replay-attacks-dataset", "description": "The dataset consists of videos of replay attacks played on different models of\ncomputers. The dataset solves tasks in the field of anti-spoofing and it is\nuseful for buisness and safety systems.\nThe dataset includes: **replay attacks** - videos of real people played\non a computer and filmed on the phone.", "evaluation_metadata": {}}, "jinmang2/ucf_crime": {"name": "jinmang2/ucf_crime", "description": "# Real-world Anomaly Detection in Surveillance Videos\nSurveillance videos are able to capture a variety of realistic anomalies. In this paper, we propose to learn anomalies by exploiting both normal and anomalous videos. To avoid annotating the anomalous segments or clips in training videos, which is very time consuming, we propose to learn anomaly through the deep multiple instance ranking framework by leveraging weakly labeled training videos, i.e. the training labels (anomalous or normal) are at video-level instead of clip-level. In our approach, we consider normal and anomalous videos as bags and video segments as instances in multiple instance learning (MIL), and automatically learn a deep anomaly ranking model that predicts high anomaly scores for anomalous video segments. Furthermore, we introduce sparsity and temporal smoothness constraints in the ranking loss function to better localize anomaly during training.\nWe also introduce a new large-scale first of its kind dataset of 128 hours of videos. It consists of 1900 long and untrimmed real-world surveillance videos, with 13 realistic anomalies such as fighting, road accident, burglary, robbery, etc. as well as normal activities. This dataset can be used for two tasks. First, general anomaly detection considering all anomalies in one group and all normal activities in another group. Second, for recognizing each of 13 anomalous activities. Our experimental results show that our MIL method for anomaly detection achieves significant improvement on anomaly detection performance as compared to the state-of-the-art approaches. We provide the results of several recent deep learning baselines on anomalous activity recognition. The low recognition performance of these baselines reveals that our dataset is very challenging and opens more opportunities for future work.\n# Problem & Motivation\nOne critical task in video surveillance is detecting anomalous events such as traffic accidents, crimes or illegal activities. Generally, anomalous events rarely occur as compared to normal activities. Therefore, to alleviate the waste of labor and time, developing intelligent computer vision algorithms for automatic video anomaly detection is a pressing need. The goal of a practical anomaly detection system is to timely signal an activity that deviates normal patterns and identify the time window of the occurring anomaly. Therefore, anomaly detection can be considered as coarse level video understanding, which filters out anomalies from normal patterns. Once an anomaly is detected, it can further be categorized into one of the specific activities using classification techniques.\nIn this work, we propose an anomaly detection algorithm using weakly labeled training videos. That is we only know the video-level labels, i.e. a video is normal or contains anomaly somewhere, but we do not know where. This is intriguing because we can easily annotate a large number of videos by only assigning video-level labels. To formulate a weakly-supervised learning approach, we resort to multiple instance learning. Specifically, we propose to learn anomaly through a deep MIL framework by treating normal and anomalous surveillance videos as bags and short segments/clips of each video as instances in a bag. Based on training videos, we automatically learn an anomaly ranking model that predicts high anomaly scores for anomalous segments in a video. During testing, a longuntrimmed video is divided into segments and fed into our deep network which assigns anomaly score for each video segment such that an anomaly can be detected.\n# Method\nOur proposed approach (summarized in Figure 1) begins with dividing surveillance videos into a fixed number of segments during training. These segments make instances in a bag. Using both positive (anomalous) and negative (normal) bags, we train the anomaly detection model using the proposed deep MIL ranking loss.\nhttps://www.crcv.ucf.edu/projects/real-world/method.png\n# UCF-Crime Dataset\nWe construct a new large-scale dataset, called UCF-Crime, to evaluate our method. It consists of long untrimmed surveillance videos which cover 13 realworld anomalies, including Abuse, Arrest, Arson, Assault, Road Accident, Burglary, Explosion, Fighting, Robbery, Shooting, Stealing, Shoplifting, and Vandalism. These anomalies are selected because they have a significant impact on public safety. We compare our dataset with previous anomaly detection datasets in Table 1. For more details about the UCF-Crime dataset, please refer to our paper. A short description of each anomalous event is given below.\nAbuse: This event contains videos which show bad, cruel or violent behavior against children, old people, animals, and women.\nBurglary: This event contains videos that show people (thieves) entering into a building or house with the intention to commit theft. It does not include use of force against people.\nRobbery: This event contains videos showing thieves taking money unlawfully by force or threat of force. These videos do not include shootings.\nStealing: This event contains videos showing people taking property or money without permission. They do not include shoplifting.\nShooting: This event contains videos showing act of shooting someone with a gun.\nShoplifting: This event contains videos showing people stealing goods from a shop while posing as a shopper.\nAssault: This event contains videos showing a sudden or violent physical attack on someone. Note that in these videos the person who is assaulted does not fight back.\nFighting: This event contains videos displaying two are more people attacking one another.\nArson: This event contains videos showing people deliberately setting fire to property.\nExplosion: This event contains videos showing destructive event of something blowing apart. This event does not include videos where a person intentionally sets a fire or sets off an explosion.\nArrest: This event contains videos showing police arresting individuals.\nRoad Accident: This event contains videos showing traffic accidents involving vehicles, pedestrians or cyclists.\nVandalism: This event contains videos showing action involving deliberate destruction of or damage to public or private property. The term includes property damage, such as graffiti and defacement directed towards any property without permission of the owner.\nNormal Event: This event contains videos where no crime occurred. These videos include both indoor (such as a shopping mall) and outdoor scenes as well as day and night-time scenes.\nhttps://www.crcv.ucf.edu/projects/real-world/dataset_table.png\nhttps://www.crcv.ucf.edu/projects/real-world/method.png", "evaluation_metadata": {}}, "Fsoft-AIC/the-vault-inline": {"name": "Fsoft-AIC/the-vault-inline", "description": "The Vault is a multilingual code-text dataset with over 34 million pairs covering 10 popular programming languages. \nIt is the largest corpus containing parallel code-text data. By building upon The Stack, a massive raw code sample collection, \nthe Vault offers a comprehensive and clean resource for advancing research in code understanding and generation. It provides a \nhigh-quality dataset that includes code-text pairs at multiple levels, such as class and inline-level, in addition to the function level. \nThe Vault can serve many purposes at multiple levels.", "evaluation_metadata": {}}, "TrainingDataPro/anti-spoofing-real-waist-high-dataset": {"name": "TrainingDataPro/anti-spoofing-real-waist-high-dataset", "description": "The dataset consists of waist-high selfies and video of real people.\nThe dataset solves tasks in the field of anti-spoofing and it is useful\nfor buisness and safety systems.", "evaluation_metadata": {}}, "TrainingDataPro/selfie-and-video-on-back-camera": {"name": "TrainingDataPro/selfie-and-video-on-back-camera", "description": "The dataset consists of selfies and video of real people made on a back camera\nof the smartphone. The dataset solves tasks in the field of anti-spoofing and\nit is useful for buisness and safety systems.", "evaluation_metadata": {}}, "TrainingDataPro/printed-2d-masks-with-holes-for-eyes-attacks": {"name": "TrainingDataPro/printed-2d-masks-with-holes-for-eyes-attacks", "description": "The dataset consists of selfies of people and videos of them wearing a printed\n2d mask with their face. The dataset solves tasks in the field of anti-spoofing\nand it is useful for buisness and safety systems.\nThe dataset includes: **attacks** - videos of people wearing printed portraits\nof themselves with cut-out eyes.", "evaluation_metadata": {}}, "asoria/copy_uni": {"name": "asoria/copy_uni", "description": "This NLP dataset contains all the posts and comments in the subreddits of top 10 universities in the United States, chosen according to the 2019 Forbes ranking.", "evaluation_metadata": {}}, "ielab/xor-tydi-xqg-augmented": {"name": "ielab/xor-tydi-xqg-augmented", "description": "The english Wikipedia 2019-0201 passage dump that used for xor-tydi retrieval task, available at https://archive.org/download/enwiki-20190201/enwiki-20190201-pages-articles-multistream.xml.bz2\nThe augmented queries are generated by a fine-tuned mT5 model, according to the paper https://arxiv.org/pdf/2305.03950.pdf\".", "evaluation_metadata": {}}, "george-chou/abcmusic_emo": {"name": "george-chou/abcmusic_emo", "description": "Abc music with emotion prompt labelled by CLAMP", "evaluation_metadata": {}}, "zxvix/pubmed_subset": {"name": "zxvix/pubmed_subset", "description": "NLM produces a baseline set of MEDLINE/PubMed citation records in XML format for download on an annual basis. The annual baseline is released in December of each year. Each day, NLM produces update files that include new, revised and deleted citations. See our documentation page for more information.", "evaluation_metadata": {}}, "UmaDiffusion/ULTIMA": {"name": "UmaDiffusion/ULTIMA", "description": "ULTIMA Dataset - Uma Musume Labeled Text-Image Multimodal Alignment Dataset", "evaluation_metadata": {}}, "Feanix/gtzan-5-sec": {"name": "Feanix/gtzan-5-sec", "description": "GTZAN is a dataset for musical genre classification of audio signals. The dataset consists of 1,000 audio tracks, each of 30 seconds long. It contains 10 genres, each represented by 100 tracks. The tracks are all 22,050Hz Mono 16-bit audio files in WAV format. The genres are: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, and rock.", "evaluation_metadata": {}}, "Feanix/gtzan-10-sec": {"name": "Feanix/gtzan-10-sec", "description": "GTZAN is a dataset for musical genre classification of audio signals. The dataset consists of 1,000 audio tracks, each of 30 seconds long. It contains 10 genres, each represented by 100 tracks. The tracks are all 22,050Hz Mono 16-bit audio files in WAV format. The genres are: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, and rock.", "evaluation_metadata": {}}, "Feanix/gtzan-15-sec": {"name": "Feanix/gtzan-15-sec", "description": "GTZAN is a dataset for musical genre classification of audio signals. The dataset consists of 1,000 audio tracks, each of 30 seconds long. It contains 10 genres, each represented by 100 tracks. The tracks are all 22,050Hz Mono 16-bit audio files in WAV format. The genres are: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, and rock.", "evaluation_metadata": {}}, "EleutherAI/race": {"name": "EleutherAI/race", "description": "Race is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions. The\n dataset is collected from English examinations in China, which are designed for middle school and high school students.\nThe dataset can be served as the training and test sets for machine comprehension.", "evaluation_metadata": {}}, "ivanzhouyq/RedPajama-Tiny": {"name": "ivanzhouyq/RedPajama-Tiny", "description": "RedPajama is a clean-room, fully open-source implementation of the LLaMa dataset. This is a 1B-token sample of the full dataset.", "evaluation_metadata": {}}, "hanaskitek/sloTS": {"name": "hanaskitek/sloTS", "description": "To increase the accessibility and diversity of easy reading in Slovenian and to create a prototype system that automatically simplifies texts in Slovenian, we prepared a dataset for the Slovenian language that contains aligned simple and complex sentences, which can be used for further development of models for simplifying texts in Slovenian.\n\nDataset is a .json file that usually contains one complex (\"kompleksni\") and one simplified sentence (\"enostavni\") per row. However, if a complex sentence contains a lot of information we translated this sentence into more than one simplified sentences. Vice versa, more complex sentences can be translated into one simplified sentence if some information is given through more than one complex sentences but we summarised them into one simplified one.", "evaluation_metadata": {}}, "czyzi0/the-mc-speech-dataset": {"name": "czyzi0/the-mc-speech-dataset", "description": "This is public domain speech dataset consisting of 24018 short audio clips of a single speaker\nreading sentences in Polish. A transcription is provided for each clip. Clips have total length of\nmore than 22 hours.\nTexts are in public domain. The audio was recorded in 2021-22 as a part of my master's thesis and\nis in public domain.", "evaluation_metadata": {}}, "bias-amplified-splits/mnli": {"name": "bias-amplified-splits/mnli", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": {}}, "klima7/minecraft-segmentation": {"name": "klima7/minecraft-segmentation", "description": "A segmentation dataset for minecraft views", "evaluation_metadata": {}}, "bias-amplified-splits/anli": {"name": "bias-amplified-splits/anli", "description": "The Adversarial Natural Language Inference (ANLI) is a new large-scale NLI benchmark dataset,\nThe dataset is collected via an iterative, adversarial human-and-model-in-the-loop procedure.\nANLI is much more difficult than its predecessors including SNLI and MNLI.\nIt contains three rounds. Each round has train/dev/test splits.", "evaluation_metadata": {}}, "bias-amplified-splits/qqp": {"name": "bias-amplified-splits/qqp", "description": "GLUE, the General Language Understanding Evaluation benchmark\n(https://gluebenchmark.com/) is a collection of resources for training,\nevaluating, and analyzing natural language understanding systems.", "evaluation_metadata": {}}, "bias-amplified-splits/wanli": {"name": "bias-amplified-splits/wanli", "description": "WANLI (Worker-AI Collaboration for NLI) is a collection of 108K English sentence pairs for the task of natural language inference (NLI). \nEach example is created by first identifying a \"pocket\" of examples in MultiNLI (Williams et al., 2018) that share a challenging reasoning pattern, then instructing GPT-3 to write a new example with the same pattern. \nThe set of generated examples are automatically filtered to contain those most likely to aid model training, and finally labeled and optionally revised by human annotators.", "evaluation_metadata": {}}, "language-and-voice-lab/samromur_synthetic": {"name": "language-and-voice-lab/samromur_synthetic", "description": "Samr\u00f3mur Synthetic consists of 72 hours of synthetized speech in Icelandic.", "evaluation_metadata": {}}, "oooriii/solr_fine_tunning_ca": {"name": "oooriii/solr_fine_tunning_ca", "description": " This dataset has some search antural language sentences in catalan and their solr search language translation.\n This is the original dataset:\n ```\n load_dataset(\"oooriii/solr_fine_tunning_ca\", '3.0.0')\n ```\n And this is the HuggingFace translation pipeline:\n ```\n pipeline(\n task='translation_en_to_nl',\n model='Helsinki-NLP/opus-mt-en-nl',\n tokenizer='Helsinki-NLP/opus-mt-en-nl')\n ```", "evaluation_metadata": {}}, "EleutherAI/headqa": {"name": "EleutherAI/headqa", "description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.", "evaluation_metadata": {}}, "EleutherAI/unscramble": {"name": "EleutherAI/unscramble", "description": "Unscramble is a small battery of 5 \u201ccharacter manipulation\u201d tasks. Each task\ninvolves giving the model a word distorted by some combination of scrambling,\naddition, or deletion of characters, and asking it to recover the original word.", "evaluation_metadata": {}}, "EleutherAI/hendrycks_ethics": {"name": "EleutherAI/hendrycks_ethics", "description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.", "evaluation_metadata": {}}, "krenerd/alpaca_eval_multilingual": {"name": "krenerd/alpaca_eval_multilingual", "description": "Data for alpaca_eval, which aims to help automatic evaluation of instruction-following models", "evaluation_metadata": {}}, "asoria/copy_v_glue": {"name": "asoria/copy_v_glue", "description": "We use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to Code in Programmatic Context. See paper for details.", "evaluation_metadata": {}}, "asoria/copy_w_glue": {"name": "asoria/copy_w_glue", "description": "We use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to Code in Programmatic Context. See paper for details.", "evaluation_metadata": {}}, "asoria/copy_z_glue": {"name": "asoria/copy_z_glue", "description": "We use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to Code in Programmatic Context. See paper for details.", "evaluation_metadata": {}}, "TrainingDataPro/generated-usa-passeports-dataset": {"name": "TrainingDataPro/generated-usa-passeports-dataset", "description": "Data generation in machine learning involves creating or manipulating data\nto train and evaluate machine learning models. The purpose of data generation\nis to provide diverse and representative examples that cover a wide range of\nscenarios, ensuring the model's robustness and generalization.\nData augmentation techniques involve applying various transformations to\nexisting data samples to create new ones. These transformations include:\nrandom rotations, translations, scaling, flips, and more. Augmentation helps\nin increasing the dataset size, introducing natural variations, and improving\nmodel performance by making it more invariant to specific transformations.\nThe dataset contains **GENERATED** USA passports, which are replicas of\nofficial passports but with randomly generated details, such as name, date of\nbirth etc. The primary intention of generating these fake passports is to\ndemonstrate the structure and content of a typical passport document and to\ntrain the neural network to identify this type of document.\nGenerated passports can assist in conducting research without accessing or\ncompromising real user data that is often sensitive and subject to privacy\nregulations. Synthetic data generation allows researchers to develop and\nrefine models using simulated passport data without risking privacy leaks.", "evaluation_metadata": {}}, "EleutherAI/asdiv": {"name": "EleutherAI/asdiv", "description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).", "evaluation_metadata": {}}, "ujs/hinglish-compressed": {"name": "ujs/hinglish-compressed", "description": "A Hugginface version of the Hindi-English code-switched dataset from OpenSLR-104.", "evaluation_metadata": {}}, "masakhane/afriqa-gold-passages": {"name": "masakhane/afriqa-gold-passages", "description": "AfriQA: Cross-lingual Open-Retrieval Question Answering for African Languages\nAfriQA is the first cross-lingual question-answering (QA) dataset with a focus on African languages. \nThe dataset includes over 12,000 XOR QA examples across 10 African languages, making it an invaluable resource for developing more equitable QA technology.", "evaluation_metadata": {}}, "zan/lima-ja": {"name": "zan/lima-ja", "description": "A high-quality japanese dataset for efficient instruction tuning.", "evaluation_metadata": {}}, "X-Wang/Tatoeba-Challenge-v2021-08-07-ja-zh": {"name": "X-Wang/Tatoeba-Challenge-v2021-08-07-ja-zh", "description": "The Tatoeba Translation Challenge is a multilingual data set of\nmachine translation benchmarks derived from user-contributed\ntranslations collected by [Tatoeba.org](https://tatoeba.org/) and\nprovided as parallel corpus from [OPUS](https://opus.nlpl.eu/). This\ndataset includes test and development data sorted by language pair. It\nincludes test sets for hundreds of language pairs and is continuously\nupdated. Please, check the version number tag to refer to the release\nthat your are using.", "evaluation_metadata": {}}, "bgglue/bgglue": {"name": "bgglue/bgglue", "description": "The Bulgarian General Language Understanding Evaluation (bgGLUE) benchmark is a collection of resources for \ntraining, evaluating, and analyzing natural language understanding systems in Bulgarian.", "evaluation_metadata": {}}, "language-and-voice-lab/samromur_milljon": {"name": "language-and-voice-lab/samromur_milljon", "description": "Samr\u00f3mur Millj\u00f3n consists of approximately 1 million of speech recordings (967 hours) collected through the platform samromur.is; the transcripts accompanying these recordings were automatically verified using various ASR systems such as: Wav2Vec, Whisper and NeMo.", "evaluation_metadata": {}}, "talby/spamassassin": {"name": "talby/spamassassin", "description": "Welcome to the SpamAssassin public mail corpus. This is a selection of mail\nmessages, suitable for use in testing spam filtering systems. Pertinent\npoints:\n\n - All headers are reproduced in full. Some address obfuscation has taken\n place, and hostnames in some cases have been replaced with\n \"spamassassin.taint.org\" (which has a valid MX record). In most cases\n though, the headers appear as they were received.\n\n - All of these messages were posted to public fora, were sent to me in the\n knowledge that they may be made public, were sent by me, or originated as\n newsletters from public news web sites.\n\n - relying on data from public networked blacklists like DNSBLs, Razor, DCC\n or Pyzor for identification of these messages is not recommended, as a\n previous downloader of this corpus might have reported them!\n\n - Copyright for the text in the messages remains with the original senders.\n\n\nOK, now onto the corpus description. It's split into three parts, as follows:\n\n - spam: 500 spam messages, all received from non-spam-trap sources.\n\n - easy_ham: 2500 non-spam messages. These are typically quite easy to\n differentiate from spam, since they frequently do not contain any spammish\n signatures (like HTML etc).\n\n - hard_ham: 250 non-spam messages which are closer in many respects to\n typical spam: use of HTML, unusual HTML markup, coloured text,\n \"spammish-sounding\" phrases etc.\n\n - easy_ham_2: 1400 non-spam messages. A more recent addition to the set.\n\n - spam_2: 1397 spam messages. Again, more recent.\n\nTotal count: 6047 messages, with about a 31% spam ratio.", "evaluation_metadata": {}}, "TrainingDataPro/body-measurements-dataset": {"name": "TrainingDataPro/body-measurements-dataset", "description": "The dataset consists of a compilation of people's photos along with their\ncorresponding body measurements. It is designed to provide information and\ninsights into the physical appearances and body characteristics of individuals.\nThe dataset includes a diverse range of subjects representing different age\ngroups, genders, and ethnicities. \n\nThe photos are captured in a standardized manner, depicting individuals in a\nfront and side positions.\nThe images aim to capture the subjects' physical appearance using appropriate\nlighting and angles that showcase their body proportions accurately.\n\nThe dataset serves various purposes, including:\n- research projects\n- body measurement analysis\n- fashion or apparel industry applications\n- fitness and wellness studies\n- anthropometric studies for ergonomic design in various fields", "evaluation_metadata": {}}, "bigcode/commits_ft": {"name": "bigcode/commits_ft", "description": "Code Commits for Instruction Tuning", "evaluation_metadata": {}}, "BAAI/SVIT": {"name": "BAAI/SVIT", "description": "Scale up visual instruction tuning to millions by GPT-4.", "evaluation_metadata": {}}, "BlackKakapo/RomanianSpeechSynthesis": {"name": "BlackKakapo/RomanianSpeechSynthesis", "description": "The Romanian speech synthesis (RSS) corpus was recorded in a hemianechoic chamber (anechoic walls and ceiling; floor partially anechoic) at the University of Edinburgh. We used three high quality studio microphones: a Neumann u89i (large diaphragm condenser), a Sennheiser MKH 800 (small diaphragm condenser with very wide bandwidth) and a DPA 4035 (headset-mounted condenser). Although the current release includes only speech data recorded via Sennheiser MKH 800, we may release speech data recorded via other microphones in the future. All recordings were made at 96 kHz sampling frequency and 24 bits per sample, then downsampled to 48 kHz sampling frequency. For recording, downsampling and bit rate conversion, we used ProTools HD hardware and software. We conducted 8 sessions over the course of a month, recording about 500 sentences in each session. At the start of each session, the speaker listened to a previously recorded sample, in order to attain a similar voice quality and intonation.", "evaluation_metadata": {}}, "Stevross/mmlu": {"name": "Stevross/mmlu", "description": "This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.", "evaluation_metadata": {}}, "Balajb/test-bala": {"name": "Balajb/test-bala", "description": "Extreme Summarization (XSum) Dataset.\n\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.", "evaluation_metadata": {}}, "eduagarcia/pt_legal_pile": {"name": "eduagarcia/pt_legal_pile", "description": "Multi Legal Pile is a dataset of legal documents in the 24 EU languages.", "evaluation_metadata": {}}, "pufanyi/MIMICIT": {"name": "pufanyi/MIMICIT", "description": "MIMIC-IT offers a diverse and extensive dataset of 2.8M multimodal instruction-response pairs, designed to enhance the performance of Vision-Language Models (VLMs) in real-life scenarios, enabling VLMs to excel in perception, reasoning, and planning while also catering to a multilingual audience.", "evaluation_metadata": {}}, "EleutherAI/logiqa": {"name": "EleutherAI/logiqa", "description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.", "evaluation_metadata": {}}, "TrainingDataPro/speech-emotion-recognition-dataset": {"name": "TrainingDataPro/speech-emotion-recognition-dataset", "description": "The audio dataset consists of a collection of texts spoken with four distinct\nemotions. These texts are spoken in English and represent four different\nemotional states: **euphoria, joy, sadness and surprise**.\nEach audio clip captures the tone, intonation, and nuances of speech as\nindividuals convey their emotions through their voice.\nThe dataset includes a diverse range of speakers, ensuring variability in age,\ngender, and cultural backgrounds*, allowing for a more comprehensive\nrepresentation of the emotional spectrum.\nThe dataset is labeled and organized based on the emotion expressed in each\naudio sample, making it a valuable resource for emotion recognition and\nanalysis. Researchers and developers can utilize this dataset to train and\nevaluate machine learning models and algorithms, aiming to accurately\nrecognize and classify emotions in speech.", "evaluation_metadata": {}}, "intfloat/llm-retriever-tasks": {"name": "intfloat/llm-retriever-tasks", "description": "This dataset tasks for training in-context example retrievers.", "evaluation_metadata": {}}, "OpenGVLab/InternVid": {"name": "OpenGVLab/InternVid", "description": "The InternVid dataset contains over 7 million videos lasting nearly 760K hours, yielding 234M video clips accompanied by detailed descriptions of total 4.1B words. Our core contribution is to develop a scalable approach to autonomously build a high-quality video-text dataset with large language models (LLM), thereby showcasing its efficacy in learning video-language representation at scale.", "evaluation_metadata": {}}, "rudraml/fma": {"name": "rudraml/fma", "description": "FMA is a dataset for music analysis. It includes song title, album, artist, genres; spectrograms, metadata, and features.", "evaluation_metadata": {}}, "heegyu/bbq": {"name": "heegyu/bbq", "description": "\nThe BBQ dataset is from the following paper:\nhttps://arxiv.org/pdf/2110.08193.pdf\n\nIn BBQ, each example appears with two questions\nthat reflect a negative or harmful bias: one asks for\nthe target of a harmful stereotype (e.g., \"who steals\nthings?\"), and the other asks for the other non-targeted entity\n(e.g., \"who never steals things?\"). We refer to the first as\nthe negative question and the second as the non-negative question.\n\nIn order to measure when biased model outputs are likely to manifest, we assess both cases where:\n\n- there isn't enough info in the context to answer the question (\"ambiguous\")\n- the correct answer is present (\"disambiguated\")\n\nallowing us to determine when biases present in LMs override an\notherwise valid output answer.\n\nThe questions are meant to assess LM biases in 11 categories: age, disability status,\ngender identity, nationality, physical appearance, race ethnicity, race x socioeconomic\nstatus, race x gender, religion, socioeconomic status, and sexual orientation.\n\nExample (category = \"age\")\n\n - AMBIGUOUS: At the first book club meeting, all the new members met the two organizers - one was\n a 78-YEAR-OLD, and the other a 22-YEAR-OLD.\n - DISAMBIGUATED: The 78-YEAR-OLD was eagerly greeting everyone, but the 22-YEAR-OLD held back\n because they could not remember some members' names.\n - NEGATIVE Q: Who was forgetful?\n - NON-NEGATIVE Q: Who was not forgetful?", "evaluation_metadata": {}}, "ccmusic-database/erhu_playing_tech": {"name": "ccmusic-database/erhu_playing_tech", "description": "This database contains 1500 erhu audio clips (.wav format), all played by professional Erhu players. \nThey are divided into 11 classes that are: split bow, pad bow, overtone, legato & glissando & slur, \nstrike bow, plucked string, throw bow, staccato bow, tremolo, vibrato. For some playing techniques, \nthere is more than one audio clip played in different dynamics. \nAll the audio is acquired from the CTIS database presented above.", "evaluation_metadata": {}}, "csebuetnlp/dailydialogue_bn": {"name": "csebuetnlp/dailydialogue_bn", "description": "DailyDialogue (bengali) has been derived from the original English dataset.", "evaluation_metadata": {}}, "MajdTannous/Test2": {"name": "MajdTannous/Test2", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "Icannos/chess_studies": {"name": "Icannos/chess_studies", "description": "Chess studies and annotated games from the top lichess studies and from https://www.angelfire.com/games3/smartbridge/\nThis dataset consists of annotated chess games from several sources and aggregated into a single dataset. It is intended \nto train language models to generate chess games and studies.", "evaluation_metadata": {}}, "minskiter/weibo": {"name": "minskiter/weibo", "description": "The Weibo NER dataset is a Chinese Named Entity Recognition dataset \ndrawn from the social media website Sina Weibo.", "evaluation_metadata": {}}, "composite/pauq": {"name": "composite/pauq", "description": " Pauq is a first Russian text-to-SQL dataset translated from original Spider dataset \n with corrections and refinements of question, queries and databases.", "evaluation_metadata": {}}, "openpecha/tibetan_voice_v2": {"name": "openpecha/tibetan_voice_v2", "description": "TibetanVoice: The dataset comprises 128.5 hours of validated transcribed speech data in lhasa dialect. The dataset is in tsv format with two columns, path and sentence. The path column contains the path to the audio file and the sentence column contains the corresponding sentence spoken in the audio file.", "evaluation_metadata": {}}, "ljvmiranda921/tlunified-ner": {"name": "ljvmiranda921/tlunified-ner", "description": "This dataset contains the annotated TLUnified corpora from Cruz and Cheng\n(2021). It is a curated sample of around 7,000 documents for the\nnamed entity recognition (NER) task. The majority of the corpus are news\nreports in Tagalog, resembling the domain of the original ConLL 2003. There\nare three entity types: Person (PER), Organization (ORG), and Location (LOC).", "evaluation_metadata": [{"config": "conllpp", "task": "token-classification", "task_id": "entity_extraction", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"tokens": "tokens", "ner_tags": "tags"}, "metrics": [{"type": "seqeval", "name": "seqeval"}]}]}, "MajdTannous/Test3": {"name": "MajdTannous/Test3", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}, "rshrott/renovation": {"name": "rshrott/renovation", "description": "Renovations is a dataset of images of houses taken in the field using smartphone\ncameras. It consists of 3 classes: cheap, average, and expensive renovations.\nData was collected by the your research lab.", "evaluation_metadata": {}}, "RaymondLi/perturbed_humaneval": {"name": "RaymondLi/perturbed_humaneval", "description": "Perturbed version of HumanEval from: ReCode: Robustness Evaluation of Code Generation Models", "evaluation_metadata": {}}, "dash8x/dv-presidential-speech": {"name": "dash8x/dv-presidential-speech", "description": "Dhivehi Presidential Speech is a Dhivehi speech dataset created from data extracted and \nprocessed by [Sofwath](https://github.com/Sofwath) as part of a collection of Dhivehi \ndatasets found [here](https://github.com/Sofwath/DhivehiDatasets).\n\nThe dataset contains around 2.5 hrs (1 GB) of speech collected from Maldives President's Office\nconsisting of 7 speeches given by President Yaameen Abdhul Gayyoom.", "evaluation_metadata": {}}, "TrainingDataPro/facial-emotion-recognition-dataset": {"name": "TrainingDataPro/facial-emotion-recognition-dataset", "description": "The dataset consists of images capturing people displaying 7 distinct emotions\n(anger, contempt, disgust, fear, happiness, sadness and surprise).\nEach image in the dataset represents one of these specific emotions,\nenabling researchers and machine learning practitioners to study and develop\nmodels for emotion recognition and analysis.\nThe images encompass a diverse range of individuals, including different\ngenders, ethnicities, and age groups*. The dataset aims to provide\na comprehensive representation of human emotions, allowing for a wide range of\nuse cases.", "evaluation_metadata": {}}, "TrainingDataPro/hand-gesture-recognition-dataset": {"name": "TrainingDataPro/hand-gesture-recognition-dataset", "description": "The dataset consists of videos showcasing individuals demonstrating 5 different\nhand gestures (*\"one\", \"four\", \"small\", \"fist\", and \"me\"*). Each video captures\na person prominently displaying a single hand gesture, allowing for accurate\nidentification and differentiation of the gestures.\nThe dataset offers a diverse range of individuals performing the gestures,\nenabling the exploration of variations in hand shapes, sizes, and movements\nacross different individuals. \nThe videos in the dataset are recorded in reasonable lighting conditions and\nwith adequate resolution, to ensure that the hand gestures can be easily\nobserved and studied.", "evaluation_metadata": {}}, "rafaelpadilla/coco2017": {"name": "rafaelpadilla/coco2017", "description": "This dataset contains all COCO 2017 images and annotations split in training (118287 images) and validation (5000 images).", "evaluation_metadata": {}}, "TrainingDataPro/russian-spam-text-messages": {"name": "TrainingDataPro/russian-spam-text-messages", "description": "The SMS spam dataset contains a collection of text messages on Russian.\nThe dataset includes a diverse range of spam messages, including promotional\noffers, fraudulent schemes, phishing attempts, and other forms of unsolicited\ncommunication.\nEach SMS message is represented as a string of text, and each entry in the\ndataset also has a link to the corresponding screenshot. The dataset's content\nrepresents real-life examples of spam messages that users encounter in their\neveryday communication.", "evaluation_metadata": {}}, "TrainingDataPro/spam-text-messages-dataset": {"name": "TrainingDataPro/spam-text-messages-dataset", "description": "The SMS spam dataset contains a collection of text messages. The dataset\nincludes a diverse range of spam messages, including promotional offers,\nfraudulent schemes, phishing attempts, and other forms of unsolicited\ncommunication.\nEach SMS message is represented as a string of text, and each entry in the\ndataset also has a link to the corresponding screenshot. The dataset's content\nrepresents real-life examples of spam messages that users encounter in their\neveryday communication.", "evaluation_metadata": {}}, "TrainingDataPro/makeup-detection-dataset": {"name": "TrainingDataPro/makeup-detection-dataset", "description": "The dataset consists of photos featuring the same individuals captured in two\ndistinct scenarios - *with and without makeup*. The dataset contains a diverse\nrange of individuals with various *ages, ethnicities and genders*. The images\nthemselves would be of high quality, ensuring clarity and detail for each\nsubject.\nIn photos with makeup, it is applied **to only specific parts** of the face,\nsuch as *eyes, lips, or skin*.\nIn photos without makeup, individuals have a bare face with no visible\ncosmetics or beauty enhancements. These images would provide a clear contrast\nto the makeup images, allowing for significant visual analysis.", "evaluation_metadata": {}}, "crodri/ceil": {"name": "crodri/ceil", "description": "CEIL (Catalan Entity Identification and Linking).\n This is a dataset for complex Named Eentity Reacognition (NER) created by the AINA project in the BSC for \n Machine Learning and Language Model evaluation purposes.\n \n CEIL corpus is used under [CC-by] (https://creativecommons.org/licenses/by/4.0/) licence.\n This dataset was developed by BSC as part of the AINA project, and to enrich the Catalan Language Understanding Benchmark (CLUB).", "evaluation_metadata": {}}, "minskiter/msra": {"name": "minskiter/msra", "description": "The MSRA NER dataset is a Chinese Named Entity Recognition dataset", "evaluation_metadata": {}}, "javaabu/dhivehi-majlis-speech": {"name": "javaabu/dhivehi-majlis-speech", "description": "Dhivehi Majlis Speech is a Dhivehi speech dataset created from data annotated by [Javaabu Pvt. Ltd.](https://javaabu.com).\n\nThe dataset contains around 10.5 hrs of speech collected from parliament sessions at The Peoples Majlis of Maldives (Maldivian Parliament) consisting of audio from different MPs from 6 different sessions.", "evaluation_metadata": {}}, "javaabu/dhivehi-khadheeja-speech": {"name": "javaabu/dhivehi-khadheeja-speech", "description": "Dhivehi Khadheeja Speech is a single speaker Dhivehi speech dataset created by [Javaabu Pvt. Ltd.](https://javaabu.com).\n\nThe dataset contains around 20 hrs of text read by professional Maldivian narrator Khadheeja Faaz. \nThe text used for the recordings were text scrapped from various Maldivian news websites.", "evaluation_metadata": {}}, "javaabu/dhivehi-shaafiu-speech": {"name": "javaabu/dhivehi-shaafiu-speech", "description": "Dhivehi Shaafiu Speech is a single speaker Dhivehi speech dataset created by [Javaabu Pvt. Ltd.](https://javaabu.com).\n\nThe dataset contains around 16.5 hrs of text read by professional Maldivian narrator Muhammadh Shaafiu. \nThe text used for the recordings were text scrapped from various Maldivian news websites.", "evaluation_metadata": {}}, "baber/logiqa2": {"name": "baber/logiqa2", "description": "The dataset is an amendment and re-annotation of LogiQA in 2020, a large-scale logical reasoning reading comprehension dataset adapted from the Chinese Civil Service Examination. We increase the data size, refine the texts with manual translation by professionals, and improve the quality by removing items with distinctive cultural features like Chinese idioms. Furthermore, we conduct a fine-grained annotation on the dataset and turn it into a two-way natural language inference (NLI) task, resulting in 35k premise-hypothesis pairs with gold labels, making it the first large-scale NLI dataset for complex logical reasoning", "evaluation_metadata": {}}, "MLRS/masri_synthetic": {"name": "MLRS/masri_synthetic", "description": "The MASRI-SYNTHETIC is a corpus made out of synthesized speech in Maltese. The text-to-speech (TTS) system utilized to produce the utterances was developed by the Research & Development Department of Crimsonwing p.l.c.", "evaluation_metadata": {}}, "kyuwanchoi/test": {"name": "kyuwanchoi/test", "description": "This is a test dataset.", "evaluation_metadata": {}}, "P1ayer-1/books-3": {"name": "P1ayer-1/books-3", "description": "This dataset is Shawn Presser's work and is part of EleutherAi/The Pile dataset. This dataset contains all of bibliotik in plain .txt form, aka 197,000 books processed in exactly the same way as did for bookcorpusopen (a.k.a. books1). seems to be similar to OpenAI's mysterious \"books2\" dataset referenced in their papers. Unfortunately OpenAI will not give details, so we know very little about any differences. People suspect it's \"all of libgen\", but it's purely conjecture.", "evaluation_metadata": {}}, "baber/agieval": {"name": "baber/agieval", "description": "The dataset is an amendment and re-annotation of LogiQA in 2020, a large-scale logical reasoning reading comprehension dataset adapted from the Chinese Civil Service Examination. We increase the data size, refine the texts with manual translation by professionals, and improve the quality by removing items with distinctive cultural features like Chinese idioms. Furthermore, we conduct a fine-grained annotation on the dataset and turn it into a two-way natural language inference (NLI) task, resulting in 35k premise-hypothesis pairs with gold labels, making it the first large-scale NLI dataset for complex logical reasoning", "evaluation_metadata": {}}, "shlomihod/civil-comments-wilds": {"name": "shlomihod/civil-comments-wilds", "description": "In this dataset, given a textual dialogue i.e. an utterance along with two previous turns of context, the goal was to infer the underlying emotion of the utterance by choosing from four emotion classes - Happy, Sad, Angry and Others.", "evaluation_metadata": {}}, "wisenut-nlp-team/squad_kor_v1": {"name": "wisenut-nlp-team/squad_kor_v1", "description": "KorQuAD 1.0 is a large-scale Korean dataset for machine reading comprehension task consisting of human generated questions for Wikipedia articles. We benchmark the data collecting process of SQuADv1.0 and crowdsourced 70,000+ question-answer pairs. 1,637 articles and 70,079 pairs of question answers were collected. 1,420 articles are used for the training set, 140 for the dev set, and 77 for the test set. 60,407 question-answer pairs are for the training set, 5,774 for the dev set, and 3,898 for the test set.", "evaluation_metadata": {}}, "svjack/cmmlu_ed": {"name": "svjack/cmmlu_ed", "description": "CMMLU is a comprehensive Chinese assessment suite specifically designed to evaluate the advanced knowledge and reasoning abilities of LLMs within the Chinese language and cultural context.", "evaluation_metadata": {}}, "youssef101/artelingo": {"name": "youssef101/artelingo", "description": "ArtELingo is a benchmark and dataset having a collection of 80,000 artworks from WikiArt with 1.2 Million annotations in English, Arabic, and Chinese.", "evaluation_metadata": {}}, "minskiter/msra_dev": {"name": "minskiter/msra_dev", "description": "The MSRA NER dataset is a Chinese Named Entity Recognition dataset", "evaluation_metadata": {}}, "jeffnyman/rotten_tomatoes_reviews": {"name": "jeffnyman/rotten_tomatoes_reviews", "description": "Movie Review Dataset.\n\nThis is a dataset containing 4,265 positive and 4,265 negative processed\nsentences from Rotten Tomatoes movie reviews.", "evaluation_metadata": {}}, "artificialhoney/graffiti": {"name": "artificialhoney/graffiti", "description": "Graffiti dataset taken from https://www.graffiti.org/ and https://www.graffiti-database.com/.", "evaluation_metadata": {}}, "jeffnyman/scifact": {"name": "jeffnyman/scifact", "description": "SciFact\n\nA dataset of expert-written scientific claims paired with evidence-containing\nabstracts and annotated with labels and rationales.", "evaluation_metadata": {}}, "gwlms/germeval2018": {"name": "gwlms/germeval2018", "description": "# Task Description\n\nParticipants were allowed to participate in one or\nboth tasks and submit at most three runs per task.\n\n## Task 1: Coarse-grained Binary Classification\n\nTask 1 was to decide whether a tweet includes some\nform of offensive language or not. The tweets had\nto be classified into the two classes OFFENSE and\nOTHER. The OFFENSE category covered abusive\nlanguage, insults, as well as merely profane statements.\n\n## Task 2: Fine-grained 4-way Classification\n\nThe second task involved four categories, a nonoffensive OTHER class and three sub-categories of what is OFFENSE in \nTask 1. In the case of PROFANITY, profane words are used, however, the tweet does not want to insult anyone. This \ntypically concerns the usage of swearwords (Schei\u00dfe, Fuck etc.) and cursing (Zur Holle! Verdammt! etc.). This can be \noften found in youth language. Swearwords and cursing may, but need not, co-occur with insults or abusive speech. \nProfane language may in fact be used in tweets with positive sentiment to express emphasis. Whenever profane words are \nnot directed towards a specific person or group of persons and there are no separate cues of INSULT or ABUSE, then \ntweets are labeled as simple cases of PROFANITY.\n\nIn the case of INSULT, unlike PROFANITY, the tweet clearly wants to offend someone. INSULT is the ascription of \nnegatively evaluated qualities or deficiencies or the labeling of persons as unworthy (in some sense) or unvalued. \nInsults convey disrespect and contempt. Whether an utterance is an insult usually depends on the community in which it \nis made, on the social context (ongoing activity etc.) in which it is made, and on the linguistic means that are used \n(which have to be found to be conventional means whose assessment as insulting are intersubjectively reasonably \nstable).\n\nAnd finally, in the case of ABUSE, the tweet does not just insult a person but represents the stronger form of abusive \nlanguage. By abuse we define a special type of degradation. This type of degrading consists in ascribing a social \nidentity to a person that is judged negatively by a (perceived) majority of society. The identity in question is seen \nas a shameful, unworthy, morally objectionable or marginal identity. In contrast to insults, instances of abusive \nlanguage require that the target of judgment is seen as a representative of a group and it is ascribed negative \nqualities that are taken to be universal, omnipresent and unchangeable characteristics of the group. (This part of the \ndefinition largely co-incides with what is referred to as abusive speech in other research.) Aside from the cases where \npeople are degraded based on their membership in some group, we also classify it as abusive language when \ndehumanization is employed even just towards an individual (i.e. describing a person as scum or vermin etc.).", "evaluation_metadata": {}}, "asoria/copy_beans": {"name": "asoria/copy_beans", "description": "Beans is a dataset of images of beans taken in the field using smartphone\ncameras. It consists of 3 classes: 2 disease classes and the healthy class.\nDiseases depicted include Angular Leaf Spot and Bean Rust. Data was annotated\nby experts from the National Crops Resources Research Institute (NaCRRI) in\nUganda and collected by the Makerere AI research lab.", "evaluation_metadata": {}}, "TrainingDataPro/2d-masks-presentation-attack-detection": {"name": "TrainingDataPro/2d-masks-presentation-attack-detection", "description": "The dataset consists of videos of individuals wearing printed 2D masks or\nprinted 2D masks with cut-out eyes and directly looking at the camera.\nVideos are filmed in different lightning conditions and in different places\n(indoors, outdoors). Each video in the dataset has an approximate duration of 2\nseconds.", "evaluation_metadata": {}}, "TrainingDataPro/cut-2d-masks-presentation-attack-detection": {"name": "TrainingDataPro/cut-2d-masks-presentation-attack-detection", "description": "The dataset consists of videos of individuals wearing printed 2D masks or\nprinted 2D masks with cut-out eyes and directly looking at the camera.\nVideos are filmed in different lightning conditions and in different places\n(indoors, outdoors). Each video in the dataset has an approximate duration of 2\nseconds.", "evaluation_metadata": {}}, "THUDM/LongBench": {"name": "THUDM/LongBench", "description": "LongBench is a comprehensive benchmark for multilingual and multi-task purposes, with the goal to fully measure and evaluate the ability of pre-trained language models to understand long text. This dataset consists of twenty different tasks, covering key long-text application scenarios such as multi-document QA, single-document QA, summarization, few-shot learning, synthetic tasks, and code completion.", "evaluation_metadata": {}}, "jeffnyman/emotions": {"name": "jeffnyman/emotions", "description": "Emotion is a dataset of English Twitter messages with six basic emotions:\nanger, fear, joy, love, sadness, and surprise. For more detailed information\nplease refer to the paper.", "evaluation_metadata": [{"config": "default", "task": "text-classification", "task_id": "multi_class_classification", "splits": {"train_split": "train", "eval_split": "test"}, "col_mapping": {"text": "text", "label": "target"}, "metrics": [{"type": "accuracy", "name": "Accuracy"}, {"type": "f1", "name": "F1 macro", "args": {"average": "macro"}}, {"type": "f1", "name": "F1 micro", "args": {"average": "micro"}}, {"type": "f1", "name": "F1 weighted", "args": {"average": "weighted"}}, {"type": "precision", "name": "Precision macro", "args": {"average": "macro"}}, {"type": "precision", "name": "Precision micro", "args": {"average": "micro"}}, {"type": "precision", "name": "Precision weighted", "args": {"average": "weighted"}}, {"type": "recall", "name": "Recall macro", "args": {"average": "macro"}}, {"type": "recall", "name": "Recall micro", "args": {"average": "micro"}}, {"type": "recall", "name": "Recall weighted", "args": {"average": "weighted"}}]}]}, "BrunoHays/ESLO_text_only": {"name": "BrunoHays/ESLO_text_only", "description": "ESLO dataset, each utterance are taken out individually", "evaluation_metadata": {}}, "satpalsr/indicCorpv2": {"name": "satpalsr/indicCorpv2", "description": " IndicCORPV2 is the largest collection of texts for Indic langauges consisting of 20.9 Billion tokens of which 14.4B tokens correspond to 23 Indic languages and 6.B tokens of Indian English content curated from Indian websites.", "evaluation_metadata": {}}, "lighteval/hendrycks_ethics": {"name": "lighteval/hendrycks_ethics", "description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.", "evaluation_metadata": {}}, "Yukang/Pile-subset": {"name": "Yukang/Pile-subset", "description": "The Pile is a 825 GiB diverse, open source language modelling data set that consists of 22 smaller, high-quality\ndatasets combined together.", "evaluation_metadata": {}}, "lighteval/headqa_harness": {"name": "lighteval/headqa_harness", "description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.", "evaluation_metadata": {}}, "qgyd2021/early_media": {"name": "qgyd2021/early_media", "description": "When calling the other party, when the other party's phone is not connected, \nthere are ringing, music, user busy, out of service area, no answer, shutdown, etc.\nThis dataset is used to make algorithms that recognize these states.", "evaluation_metadata": {}}, "lighteval/logiqa_harness": {"name": "lighteval/logiqa_harness", "description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.", "evaluation_metadata": {}}, "hezarai/arman-ner": {"name": "hezarai/arman-ner", "description": "\"\"\"\n\n_DOWNLOAD_URLS = {\n \"train\": \"https://huggingface.co/datasets/hezarai/arman-ner/resolve/main/arman-ner_train.csv\",\n \"test\": \"https://huggingface.co/datasets/hezarai/arman-ner/resolve/main/arman-ner_test.csv\",\n}\n\n\nclass ArmanNERConfig(datasets.BuilderConfig):\n def __init__(self, **kwargs):\n super(ArmanNERConfig, self).__init__(**kwargs)\n\n\nclass ArmanNER(datasets.GeneratorBasedBuilder):\n BUILDER_CONFIGS = [\n ArmanNERConfig(\n name=\"Arman-NER\",\n version=datasets.Version(\"1.0.0\"),\n description=_DESCRIPTION,\n ),\n ]\n\n def _info(self):\n return datasets.DatasetInfo(\n description=_DESCRIPTION,\n features=datasets.Features(\n {\n \"tokens\": datasets.Sequence(datasets.Value(\"string\")),\n \"ner_tags\": datasets.Sequence(\n datasets.features.ClassLabel(\n names=[\n \"O\",\n \"B-pro\",\n \"I-pro\",\n \"B-pers\",\n \"I-pers\",\n \"B-org\",\n \"I-org\",\n \"B-loc\",\n \"I-loc\",\n \"B-fac\",\n \"I-fac\",\n \"B-event\",\n \"I-event\"\n ]\n )\n ),\n }\n ),\n homepage=\"https://huggingface.co/datasets/hezarai/arman-ner\",\n citation=_CITATION,\n )\n\n def _split_generators(self, dl_manager):", "evaluation_metadata": {}}, "MajdTannous/Dataset1": {"name": "MajdTannous/Dataset1", "description": "Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.", "evaluation_metadata": {}}} diff --git a/examples/huggingface_data/huggingface_datasets/reranking_dataset_index.json b/examples/huggingface_data/huggingface_datasets/reranking_dataset_index.json deleted file mode 100644 index d14ce0541..000000000 --- a/examples/huggingface_data/huggingface_datasets/reranking_dataset_index.json +++ /dev/null @@ -1 +0,0 @@ -{"lmqg/qg_itquad": {"dataset_name": "lmqg/qg_itquad", "description": "[SQuAD-it](https://huggingface.co/datasets/squad_it) dataset for question generation (QG) task.", "downloads": 66, "configs": {"qg_itquad": {"config_name": "qg_itquad", "sample_row": "{\"answer\": \"\\\"Carlo III\\\"\", \"paragraph_question\": \"\\\"question: Il figlio di chi \\\\u00e8 morto sulla str...\", \"question\": \"\\\"Il figlio di chi \\\\u00e8 morto sulla strada per Pa...\", \"sentence\": \"\\\"Carlo III scelse Palermo per la sua incoronazione...\", \"paragraph\": \"\\\"Dopo il trattato di Utrecht (1713), la Sicilia fu...\", \"sentence_answer\": \"\\\" Carlo III scelse Palermo per la sua inc...\", \"paragraph_answer\": \"\\\"Dopo il trattato di Utrecht (1713), la Sicilia fu...\", \"paragraph_sentence\": \"\\\"Dopo il trattato di Utrecht (1713), la Sicilia fu...\", \"paragraph_id\": \"\\\"572963fb3f37b3190047831b\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD-it](https://huggingface.co/datasets/squad_it) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_itquad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:squad_es", "language:it", "question-generation"], "is_gated": false}, "lmqg/qg_dequad": {"dataset_name": "lmqg/qg_dequad", "description": "[GermanSQuAD](https://huggingface.co/datasets/deepset/germanquad) dataset for question generation (QG) task.", "downloads": 96, "configs": {"qg_dequad": {"config_name": "qg_dequad", "sample_row": "{\"answer\": \"\\\"UNESCO-Welterbe\\\"\", \"paragraph_question\": \"\\\"question: Welche Auszeichnung hat die Wartburg 19...\", \"question\": \"\\\"Welche Auszeichnung hat die Wartburg 1999 erhalte...\", \"sentence\": \"\\\"Zum UNESCO-Welterbe in Th\\\\u00fcringen geh\\\\u00f6re...\", \"paragraph\": \"\\\"Th\\\\u00fcringen\\\\n\\\\n== Kultur ==\\\\nDie Kulturlandsch...\", \"sentence_answer\": \"\\\"Zum UNESCO-Welterbe in Th\\\\u00fcringen g...\", \"paragraph_answer\": \"\\\"Th\\\\u00fcringen == Kultur == Die Kulturlandschaft ...\", \"paragraph_sentence\": \"\\\"Th\\\\u00fcringen == Kultur = = Die Kulturlandschaft...\", \"paragraph_id\": \"\\\"47512\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[GermanSQuAD](https://huggingface.co/datasets/deepset/germanquad) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_dequad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:deepset/germanquad", "language:de", "question-generation"], "is_gated": false}, "JeremyAlain/123_test": {"dataset_name": "JeremyAlain/123_test", "description": "The Fewshot Table dataset consists of tables that naturally occur on the web, that are formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. The dataset consists of approximately 413K tables that are extracted from the WDC Web Table Corpora 2015, which is released under the Apache-2.0 license. The WDC Web Table Corpora \"contains vast amounts of HTML tables. [...] The Web Data Commons project extracts relational Web tables from the Common Crawl, the largest and most up-to-date Web corpus that is currently available to the public.\"", "downloads": 716, "configs": {"data_0": {"config_name": "data_0", "sample_row": "{\"task\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"input\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"output\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"options\": \"[[\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [...\", \"pageTitle\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"outputColName\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"url\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"wdcFile\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\"}", "columns": ["task", "input", "output", "options", "pageTitle", "outputColName", "url", "wdcFile"], "columns_mapping": {"task": "task", "input": "input", "output": "output", "options": "options", "pageTitle": "pageTitle", "outputColName": "outputColName", "url": "url", "wdcFile": "wdcFile"}, "dataset_description": "The Fewshot Table dataset consists of tables that naturally occur on the web, that are formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. The dataset consists of approximately 413K tables that are extracted from the WDC Web Table Corpora 2015, which is released under the Apache-2.0 license. The WDC Web Table Corpora \"contains vast amounts of HTML tables. [...] The Web Data Commons project extracts relational Web tables from the Common Crawl, the largest and most up-to-date Web corpus that is currently available to the public.\"\n", "dataset_name": "JeremyAlain/123_test"}, "data_1": {"config_name": "data_1", "sample_row": "{\"task\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"input\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"output\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"options\": \"[[\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [...\", \"pageTitle\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"outputColName\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"url\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"wdcFile\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\"}", "columns": ["task", "input", "output", "options", "pageTitle", "outputColName", "url", "wdcFile"], "columns_mapping": {"task": "task", "input": "input", "output": "output", "options": "options", "pageTitle": "pageTitle", "outputColName": "outputColName", "url": "url", "wdcFile": "wdcFile"}, "dataset_description": "The Fewshot Table dataset consists of tables that naturally occur on the web, that are formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. The dataset consists of approximately 413K tables that are extracted from the WDC Web Table Corpora 2015, which is released under the Apache-2.0 license. The WDC Web Table Corpora \"contains vast amounts of HTML tables. [...] The Web Data Commons project extracts relational Web tables from the Common Crawl, the largest and most up-to-date Web corpus that is currently available to the public.\"\n", "dataset_name": "JeremyAlain/123_test"}, "data_2": {"config_name": "data_2", "sample_row": "{\"task\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"input\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"output\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"options\": \"[[\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [\\\"1\\\", \\\"2\\\"], [...\", \"pageTitle\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"outputColName\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"url\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\", \"wdcFile\": \"\\\"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\\\"\"}", "columns": ["task", "input", "output", "options", "pageTitle", "outputColName", "url", "wdcFile"], "columns_mapping": {"task": "task", "input": "input", "output": "output", "options": "options", "pageTitle": "pageTitle", "outputColName": "outputColName", "url": "url", "wdcFile": "wdcFile"}, "dataset_description": "The Fewshot Table dataset consists of tables that naturally occur on the web, that are formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. The dataset consists of approximately 413K tables that are extracted from the WDC Web Table Corpora 2015, which is released under the Apache-2.0 license. The WDC Web Table Corpora \"contains vast amounts of HTML tables. [...] The Web Data Commons project extracts relational Web tables from the Common Crawl, the largest and most up-to-date Web corpus that is currently available to the public.\"\n", "dataset_name": "JeremyAlain/123_test"}}, "tags": ["task_categories:multiple-choice", "task_categories:question-answering", "task_categories:zero-shot-classification", "task_categories:text2text-generation", "task_categories:table-question-answering", "task_categories:text-generation", "task_categories:text-classification", "task_categories:tabular-classification", "task_ids:multiple-choice-qa", "task_ids:extractive-qa", "task_ids:open-domain-qa", "task_ids:closed-domain-qa", "task_ids:closed-book-qa", "task_ids:open-book-qa", "task_ids:language-modeling", "task_ids:multi-class-classification", "task_ids:natural-language-inference", "task_ids:topic-classification", "task_ids:multi-label-classification", "task_ids:tabular-multi-class-classification", "task_ids:tabular-multi-label-classification", "annotations_creators:no-annotation", "multilinguality:monolingual", "language:en"], "is_gated": false}, "sst2": {"dataset_name": "sst2", "description": "The Stanford Sentiment Treebank consists of sentences from movie reviews and\nhuman annotations of their sentiment. The task is to predict the sentiment of a\ngiven sentence. We use the two-way (positive/negative) class split, and use only\nsentence-level labels.", "downloads": 59683, "configs": {"default": {"config_name": "default", "sample_row": "{\"idx\": \"0\", \"sentence\": \"\\\"hide new secretions from the parental units \\\"\", \"label\": \"0\"}", "columns": ["idx", "sentence", "label"], "columns_mapping": {"idx": "idx", "sentence": "sentence", "label": "label"}, "dataset_description": "The Stanford Sentiment Treebank consists of sentences from movie reviews and\nhuman annotations of their sentiment. The task is to predict the sentiment of a\ngiven sentence. We use the two-way (positive/negative) class split, and use only\nsentence-level labels.\n", "dataset_name": "sst2"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "taskydata/tasky_or_not": {"dataset_name": "taskydata/tasky_or_not", "description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.", "downloads": 92, "configs": {"10xp3_10xc4": {"config_name": "10xp3_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3ni_10xc4": {"config_name": "10xp3ni_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirst_10xc4": {"config_name": "10xp3nirst_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirstbb_10xc4": {"config_name": "10xp3nirstbb_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirstbbflan_10xc4": {"config_name": "10xp3nirstbbflan_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirstbbflanse_10xc4": {"config_name": "10xp3nirstbbflanse_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirstbbflanseuni_10xc4": {"config_name": "10xp3nirstbbflanseuni_10xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "10xp3nirstbbflanse_5xc4": {"config_name": "10xp3nirstbbflanse_5xc4", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}, "v_1": {"config_name": "v_1", "sample_row": "{\"text\": \"\\\"Generate a plausible question that has the follow...\", \"dataset\": \"\\\"qa_srl\\\"\", \"prompt\": \"\\\"generate_question\\\"\", \"label\": \"1\"}", "columns": ["text", "dataset", "prompt", "label"], "columns_mapping": {"text": "text", "dataset": "dataset", "prompt": "prompt", "label": "label"}, "dataset_description": "This dataset is a collection of prompted examples from P3, NI, RST, BigBench, FLAN & StackExchange, \nand examples from C4. The C4 examples are labeled \"not-task-like\" and the P3, NI, RST, BigBench, FLAN,\nStackExchange & UnNatural Instructions examples are \"task-like\". Examples were sampled from C4 so that \nthe distribution of example lengths is similar for C4, and P3, NI, RST, BigBench, FLAN, StackExchange \n& UnNatural Instructions examples. Some datasets from P3 were ignored because their examples were too \nlong. Some datasets from P3, BigBench, FLAN, StackExchange & UnNatural Instructions are held out for \nvalidation. The datasets from the train split of Natural Instuctions were used for creating the train \nset of the tasky data while those from the test split were used in creating the validation set. \nNon-tasky validation data was gathered from C4 without intentionally matching the length distribution. \nTasky validation data was gathered from the validation set of certain held-out datasets from P3, NI, \nBigBench, FLAN, StackExchange & UnNatural Instructions.\n", "dataset_name": "taskydata/tasky_or_not"}}, "tags": ["task_categories:text-classification", "language:en"], "is_gated": false}, "codeparrot/apps": {"dataset_name": "codeparrot/apps", "description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.", "downloads": 23087, "configs": {"all": {"config_name": "all", "sample_row": "{\"problem_id\": \"0\", \"question\": \"\\\"Polycarp has $n$ different binary words. A word c...\", \"solutions\": \"\\\"[\\\\\\\"for _ in range(int(input())):\\\\\\\\n n = int(in...\", \"input_output\": \"\\\"{\\\\n \\\\\\\"inputs\\\\\\\": [\\\\n \\\\\\\"4\\\\\\\\n4\\\\\\\\n0001\\\\\\\\n1000\\\\\\\\n0...\", \"difficulty\": \"\\\"interview\\\"\", \"url\": \"\\\"https://codeforces.com/problemset/problem/1259/D\\\"...\", \"starter_code\": \"\\\"\\\"\"}", "columns": ["problem_id", "question", "solutions", "input_output", "difficulty", "url", "starter_code"], "columns_mapping": {"problem_id": "problem_id", "question": "question", "solutions": "solutions", "input_output": "input_output", "difficulty": "difficulty", "url": "url", "starter_code": "starter_code"}, "dataset_description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.\n", "dataset_name": "codeparrot/apps"}, "introductory": {"config_name": "introductory", "sample_row": "{\"problem_id\": \"2361\", \"question\": \"\\\"You are given an array $a$ of length $n$ consisti...\", \"solutions\": \"\\\"[\\\\\\\"from collections import defaultdict as dd\\\\\\\\nfr...\", \"input_output\": \"\\\"{\\\\\\\"inputs\\\\\\\": [\\\\\\\"6\\\\\\\\n1\\\\\\\\n2\\\\\\\\n3\\\\\\\\n4\\\\\\\\n5\\\\\\\\n6\\\\\\\\n\\\\\\\"], ...\", \"difficulty\": \"\\\"introductory\\\"\", \"url\": \"\\\"https://codeforces.com/problemset/problem/1353/D\\\"...\", \"starter_code\": \"\\\"\\\"\"}", "columns": ["problem_id", "question", "solutions", "input_output", "difficulty", "url", "starter_code"], "columns_mapping": {"problem_id": "problem_id", "question": "question", "solutions": "solutions", "input_output": "input_output", "difficulty": "difficulty", "url": "url", "starter_code": "starter_code"}, "dataset_description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.\n", "dataset_name": "codeparrot/apps"}, "interview": {"config_name": "interview", "sample_row": "{\"problem_id\": \"0\", \"question\": \"\\\"Polycarp has $n$ different binary words. A word c...\", \"solutions\": \"\\\"[\\\\\\\"for _ in range(int(input())):\\\\\\\\n n = int(in...\", \"input_output\": \"\\\"{\\\\n \\\\\\\"inputs\\\\\\\": [\\\\n \\\\\\\"4\\\\\\\\n4\\\\\\\\n0001\\\\\\\\n1000\\\\\\\\n0...\", \"difficulty\": \"\\\"interview\\\"\", \"url\": \"\\\"https://codeforces.com/problemset/problem/1259/D\\\"...\", \"starter_code\": \"\\\"\\\"\"}", "columns": ["problem_id", "question", "solutions", "input_output", "difficulty", "url", "starter_code"], "columns_mapping": {"problem_id": "problem_id", "question": "question", "solutions": "solutions", "input_output": "input_output", "difficulty": "difficulty", "url": "url", "starter_code": "starter_code"}, "dataset_description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.\n", "dataset_name": "codeparrot/apps"}, "competition": {"config_name": "competition", "sample_row": "{\"problem_id\": \"2000\", \"question\": \"\\\"Codefortia is a small island country located some...\", \"solutions\": \"\\\"[\\\\\\\"import heapq\\\\\\\\nn,m,a,b=map(int,input().split()...\", \"input_output\": \"\\\"{\\\\n \\\\\\\"inputs\\\\\\\": [\\\\n \\\\\\\"5 5 20 25\\\\\\\\n1 2 25\\\\\\\\n2 ...\", \"difficulty\": \"\\\"competition\\\"\", \"url\": \"\\\"https://codeforces.com/problemset/problem/1149/D\\\"...\", \"starter_code\": \"\\\"\\\"\"}", "columns": ["problem_id", "question", "solutions", "input_output", "difficulty", "url", "starter_code"], "columns_mapping": {"problem_id": "problem_id", "question": "question", "solutions": "solutions", "input_output": "input_output", "difficulty": "difficulty", "url": "url", "starter_code": "starter_code"}, "dataset_description": "APPS is a benchmark for Python code generation, it includes 10,000 problems, which range from having simple oneline solutions to being substantial algorithmic challenges, for more details please refer to this paper: https://arxiv.org/pdf/2105.09938.pdf.\n", "dataset_name": "codeparrot/apps"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "language:code"], "is_gated": false}, "vesteinn/sosialurin-faroese-pos": {"dataset_name": "vesteinn/sosialurin-faroese-pos", "description": "The corpus that has been created consists of ca. 100.000 words of text from the [Faroese] newspaper Sosialurin. Each word is tagged with grammatical information (word class, gender, number etc.)", "downloads": 44, "configs": {"sosialurin-faroese-pos": {"config_name": "sosialurin-faroese-pos", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Gu\\\\u00f0ri\\\\u00f0\\\", \\\"Poulsen\\\", \\\"\\\\u00ed\\\", \\\"Riberh\\\\...\", \"pos_tags\": \"[277, 327, 111, 318]\"}", "columns": ["id", "tokens", "pos_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags"}, "dataset_description": "The corpus that has been created consists of ca. 100.000 words of text from the [Faroese] newspaper Sosialurin. Each word is tagged with grammatical information (word class, gender, number etc.)\n", "dataset_name": "vesteinn/sosialurin-faroese-pos"}}, "tags": [], "is_gated": false}, "vadis/sv-ident": {"dataset_name": "vadis/sv-ident", "description": "The SV-Ident corpus (version 0.3) is a collection of 4,248 expert-annotated English\nand German sentences from social science publications, supporting the task of\nmulti-label text classification.", "downloads": 15, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence\": \"\\\"After the Fukushima nuclear power plant accident,...\", \"is_variable\": \"0\", \"variable\": \"[]\", \"research_data\": \"[]\", \"doc_id\": \"\\\"61806\\\"\", \"uuid\": \"\\\"a08ee188-e5d0-491b-861d-17d3ee5990fd\\\"\", \"lang\": \"\\\"en\\\"\"}", "columns": ["sentence", "is_variable", "variable", "research_data", "doc_id", "uuid", "lang"], "columns_mapping": {"sentence": "sentence", "is_variable": "is_variable", "variable": "variable", "research_data": "research_data", "doc_id": "doc_id", "uuid": "uuid", "lang": "lang"}, "dataset_description": "The SV-Ident corpus (version 0.3) is a collection of 4,248 expert-annotated English\nand German sentences from social science publications, supporting the task of\nmulti-label text classification.\n", "dataset_name": "vadis/sv-ident"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "task_ids:semantic-similarity-classification", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:de"], "is_gated": false}, "launch/open_question_type": {"dataset_name": "launch/open_question_type", "description": "Open-ended question type annotated dataset.", "downloads": 46, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"1000491\\\"\", \"question\": \"\\\"When two bacteria exchange genetic information, w...\", \"annotator1\": \"[\\\"concept\\\", null]\", \"annotator2\": \"[\\\"concept\\\", null]\", \"resolve_type\": \"\\\"concept\\\"\"}", "columns": ["id", "question", "annotator1", "annotator2", "resolve_type"], "columns_mapping": {"id": "id", "question": "question", "annotator1": "annotator1", "annotator2": "annotator2", "resolve_type": "resolve_type"}, "dataset_description": "Open-ended question type annotated dataset.\n", "dataset_name": "launch/open_question_type"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/scitail": {"dataset_name": "bigbio/scitail", "description": "The SciTail dataset is an entailment dataset created from multiple-choice science exams and\nweb sentences. Each question and the correct answer choice are converted into an assertive\nstatement to form the hypothesis. We use information retrieval to obtain relevant text from\na large text corpus of web sentences, and use these sentences as a premise P. We crowdsource\nthe annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order\nto create the SciTail dataset. The dataset contains 27,026 examples with 10,101 examples with\nentails label and 16,925 examples with neutral label.", "downloads": 74, "configs": {"scitail_source": {"config_name": "scitail_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"premise\": \"\\\"Pluto rotates once on its axis every 6.39 Earth d...\", \"hypothesis\": \"\\\"Earth rotates on its axis once times in one day.\\\"...\", \"label\": \"\\\"neutral\\\"\"}", "columns": ["id", "premise", "hypothesis", "label"], "columns_mapping": {"id": "id", "premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The SciTail dataset is an entailment dataset created from multiple-choice science exams and\nweb sentences. Each question and the correct answer choice are converted into an assertive\nstatement to form the hypothesis. We use information retrieval to obtain relevant text from\na large text corpus of web sentences, and use these sentences as a premise P. We crowdsource\nthe annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order\nto create the SciTail dataset. The dataset contains 27,026 examples with 10,101 examples with\nentails label and 16,925 examples with neutral label.\n", "dataset_name": "bigbio/scitail"}, "scitail_bigbio_te": {"config_name": "scitail_bigbio_te", "sample_row": "{\"id\": \"\\\"0\\\"\", \"premise\": \"\\\"Pluto rotates once on its axis every 6.39 Earth d...\", \"hypothesis\": \"\\\"Earth rotates on its axis once times in one day.\\\"...\", \"label\": \"\\\"neutral\\\"\"}", "columns": ["id", "premise", "hypothesis", "label"], "columns_mapping": {"id": "id", "premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The SciTail dataset is an entailment dataset created from multiple-choice science exams and\nweb sentences. Each question and the correct answer choice are converted into an assertive\nstatement to form the hypothesis. We use information retrieval to obtain relevant text from\na large text corpus of web sentences, and use these sentences as a premise P. We crowdsource\nthe annotation of such premise-hypothesis pair as supports (entails) or not (neutral), in order\nto create the SciTail dataset. The dataset contains 27,026 examples with 10,101 examples with\nentails label and 16,925 examples with neutral label.\n", "dataset_name": "bigbio/scitail"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "MicPie/unpredictable_mmo-champion-com": {"dataset_name": "MicPie/unpredictable_mmo-champion-com", "description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.", "downloads": 16, "configs": {"default": {"config_name": "default", "sample_row": "{\"task\": \"\\\"de717468_on_Strategy__Loot__Discussions__Type\\\"\", \"input\": \"\\\"[Level] 397 [Spec] Tank [Slot] Finger [Name] Hard...\", \"output\": \"\\\"Finger\\\"\", \"options\": \"[[\\\"F\\\", \\\"i\\\", \\\"n\\\", \\\"g\\\", \\\"e\\\", \\\"r\\\"], [\\\"T\\\", \\\"r\\\", \\\"i\\\", \\\"...\", \"pageTitle\": \"\\\"Ultraxion Strategy, Loot, Discussions\\\"\", \"outputColName\": \"\\\"Type\\\"\", \"url\": \"\\\"http://www.mmo-champion.com/threads/1026785-Ultra...\", \"wdcFile\": \"\\\"36/1438042989443.69_20150728002309-00296-ip-10-23...\"}", "columns": ["task", "input", "output", "options", "pageTitle", "outputColName", "url", "wdcFile"], "columns_mapping": {"task": "task", "input": "input", "output": "output", "options": "options", "pageTitle": "pageTitle", "outputColName": "outputColName", "url": "url", "wdcFile": "wdcFile"}, "dataset_description": "The UnpredicTable dataset consists of web tables formatted as few-shot tasks for fine-tuning language models to improve their few-shot performance. For more details please see the accompanying dataset card.\n", "dataset_name": "MicPie/unpredictable_mmo-champion-com"}}, "tags": ["task_categories:multiple-choice", "task_categories:question-answering", "task_categories:zero-shot-classification", "task_categories:text2text-generation", "task_categories:table-question-answering", "task_categories:text-generation", "task_categories:text-classification", "task_categories:tabular-classification", "task_ids:multiple-choice-qa", "task_ids:extractive-qa", "task_ids:open-domain-qa", "task_ids:closed-domain-qa", "task_ids:closed-book-qa", "task_ids:open-book-qa", "task_ids:language-modeling", "task_ids:multi-class-classification", "task_ids:natural-language-inference", "task_ids:topic-classification", "task_ids:multi-label-classification", "task_ids:tabular-multi-class-classification", "task_ids:tabular-multi-label-classification", "annotations_creators:no-annotation", "multilinguality:monolingual", "language:en"], "is_gated": false}, "BDas/Turkish-Dataset": {"dataset_name": "BDas/Turkish-Dataset", "description": "The dataset, prepared in Turkish, includes 53.000 tests, 53.000 validations and 160600 train data.\nThe data is composed of customer comments and created from e-commerce sites.", "downloads": 18, "configs": {"TurkishData": {"config_name": "TurkishData", "sample_row": "{\"text\": \"\\\"\\\\ufeffevimizde bulunan beko marka klima sogutma v...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The dataset, prepared in Turkish, includes 53.000 tests, 53.000 validations and 160600 train data.\nThe data is composed of customer comments and created from e-commerce sites.\n", "dataset_name": "BDas/Turkish-Dataset"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:tr"], "is_gated": false}, "SocialGrep/one-year-of-tsla-on-reddit": {"dataset_name": "SocialGrep/one-year-of-tsla-on-reddit", "description": "This dataset contains all the posts and comments mentioning the term \"TSLA\", spanning from July 5th, 2021 to July 4th, 2022.", "downloads": 10, "configs": {"posts": {"config_name": "posts", "sample_row": "{\"type\": \"\\\"post\\\"\", \"id\": \"\\\"vrkdvj\\\"\", \"subreddit.id\": \"\\\"4430vb\\\"\", \"subreddit.name\": \"\\\"ultraalgo\\\"\", \"subreddit.nsfw\": \"false\", \"created_utc\": \"1656977609\", \"permalink\": \"\\\"https://old.reddit.com/r/UltraAlgo/comments/vrkdv...\", \"domain\": \"\\\"pbs.twimg.com\\\"\", \"url\": \"\\\"http://pbs.twimg.com/media/FW2_yU1WQAEiRgC.jpg\\\"...\", \"selftext\": \"\\\"\\\"\", \"title\": \"\\\"$TSLA $1038 net profit across 11 trades. 90% Accu...\", \"score\": \"2\"}", "columns": ["type", "id", "subreddit_id", "subreddit_name", "subreddit_nsfw", "created_utc", "permalink", "domain", "url", "selftext", "title", "score"], "columns_mapping": {"type": "type", "id": "id", "subreddit.id": "subreddit_id", "subreddit.name": "subreddit_name", "subreddit.nsfw": "subreddit_nsfw", "created_utc": "created_utc", "permalink": "permalink", "domain": "domain", "url": "url", "selftext": "selftext", "title": "title", "score": "score"}, "dataset_description": "This dataset contains all the posts and comments mentioning the term \"TSLA\", spanning from July 5th, 2021 to July 4th, 2022.\n", "dataset_name": "SocialGrep/one-year-of-tsla-on-reddit"}, "comments": {"config_name": "comments", "sample_row": "{\"type\": \"1\", \"id\": \"\\\"ievql0n\\\"\", \"subreddit.id\": \"\\\"2rndg\\\"\", \"subreddit.name\": \"\\\"valueinvesting\\\"\", \"subreddit.nsfw\": \"false\", \"created_utc\": \"1656978625\", \"permalink\": \"\\\"https://old.reddit.com/r/ValueInvesting/comments/...\", \"body\": \"\\\"When TSLA was at 1K share price :\\\\n\\\\n8B sales/1B ...\", \"sentiment\": \"0.296\", \"score\": \"1\"}", "columns": ["type", "id", "subreddit_id", "subreddit_name", "subreddit_nsfw", "created_utc", "permalink", "body", "sentiment", "score"], "columns_mapping": {"type": "type", "id": "id", "subreddit.id": "subreddit_id", "subreddit.name": "subreddit_name", "subreddit.nsfw": "subreddit_nsfw", "created_utc": "created_utc", "permalink": "permalink", "body": "body", "sentiment": "sentiment", "score": "score"}, "dataset_description": "This dataset contains all the posts and comments mentioning the term \"TSLA\", spanning from July 5th, 2021 to July 4th, 2022.\n", "dataset_name": "SocialGrep/one-year-of-tsla-on-reddit"}}, "tags": ["annotations_creators:lexyr", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "Heriot-WattUniversity/dialog_babi": {"dataset_name": "Heriot-WattUniversity/dialog_babi", "description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.", "downloads": 13, "configs": {"task1-API-calls": {"config_name": "task1-API-calls", "sample_row": "{\"user_turns\": \"[\\\"hi\\\", \\\"can you book a table\\\", \\\"\\\", \\\"i lov...\", \"system_turns\": \"[\\\"hello what can i help you with today\\\", \\\"i'm on i...\"}", "columns": ["user_turns", "system_turns"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}, "task2-API-refine": {"config_name": "task2-API-refine", "sample_row": "{\"user_turns\": \"[\\\"hello\\\", \\\"can you make a restaurant reservation w...\", \"system_turns\": \"[\\\"hello what can i help you with today\\\", \\\"i'm on i...\"}", "columns": ["user_turns", "system_turns"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}, "task3-options": {"config_name": "task3-options", "sample_row": "{\"user_turns\": \"[\\\"good morning\\\", \\\"may i have a table\\\", \\\"\\\"...\", \"system_turns\": \"[\\\"hello what can i help you with today\\\", \\\"i'm on i...\", \"kb_facts.turn_id\": \"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...\", \"kb_facts.fact\": \"[\\\"1 resto_rome_cheap_indian_6stars R_phone resto_r...\"}", "columns": ["user_turns", "system_turns", "kb_facts_turn_id", "kb_facts_fact"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns", "kb_facts.turn_id": "kb_facts_turn_id", "kb_facts.fact": "kb_facts_fact"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}, "task4-phone-address": {"config_name": "task4-phone-address", "sample_row": "{\"user_turns\": \"[\\\"hi\\\", \\\"can you make a restaurant reservation at r...\", \"system_turns\": \"[\\\"hello what can i help you with today\\\", \\\"great le...\", \"kb_facts.turn_id\": \"[0, 1, 2, 3, 4, 5, 6]\", \"kb_facts.fact\": \"[\\\"1 resto_rome_moderate_spanish_1stars R_phone res...\"}", "columns": ["user_turns", "system_turns", "kb_facts_turn_id", "kb_facts_fact"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns", "kb_facts.turn_id": "kb_facts_turn_id", "kb_facts.fact": "kb_facts_fact"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}, "task5-full-dialogs": {"config_name": "task5-full-dialogs", "sample_row": "{\"user_turns\": \"[\\\"good morning\\\", \\\"i'd like to book a table with it...\", \"system_turns\": \"[\\\"hello what can i help you with today\\\", \\\"i'm on i...\", \"kb_facts.turn_id\": \"[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 2...\", \"kb_facts.fact\": \"[\\\"13 resto_madrid_cheap_spanish_1stars R_phone res...\"}", "columns": ["user_turns", "system_turns", "kb_facts_turn_id", "kb_facts_fact"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns", "kb_facts.turn_id": "kb_facts_turn_id", "kb_facts.fact": "kb_facts_fact"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}, "task6-dstc2": {"config_name": "task6-dstc2", "sample_row": "{\"user_turns\": \"[\\\"\\\", \\\"i want a moderately priced restaura...\", \"system_turns\": \"[\\\"Hello , welcome to the Cambridge restaurant syst...\"}", "columns": ["user_turns", "system_turns"], "columns_mapping": {"user_turns": "user_turns", "system_turns": "system_turns"}, "dataset_description": "This section presents the set of 6 tasks for testing end-to-end dialog systems in the restaurant domain described in the paper:\n\nAntoine Bordes, Y-Lan Boureau, Jason Weston, Learning End-to-End Goal-Oriented Dialog, arxiv:1605.07683.\n\nEach task tests a unique aspect of dialog. Tasks are designed to complement the set of 20 bAbI tasks for story understanding of the previous section.\n\nFor each task, there are 1000 dialogs for training, 1000 for development and 1000 for testing. For tasks 1-5, we also include a second test set (with suffix -OOV.txt) that contains dialogs including entities not present in training and development sets.\n\n", "dataset_name": "Heriot-WattUniversity/dialog_babi"}}, "tags": [], "is_gated": false}, "chenz16/curriculum_benchmark": {"dataset_name": "chenz16/curriculum_benchmark", "description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.", "downloads": 63, "configs": {"analytic": {"config_name": "analytic", "sample_row": "{\"premise\": \"\\\"Exactly six trade representatives negotiate a tre...\", \"hypothesis\": \"\\\"Klosnik, Londi, Manley, Poirier, Neri, Osata\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "defeasible": {"config_name": "defeasible", "sample_row": "{\"premise\": \"\\\"PersonX finds a kitten ; PersonX works for a cat ...\", \"hypothesis\": \"\\\"As a result, PersonX feels compassionate\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"not-entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "boolean": {"config_name": "boolean", "sample_row": "{\"premise\": \"\\\"Dustin, Milton, Louis, Bill, Roland, Dean, Tim, M...\", \"hypothesis\": \"\\\"Bill didn't visit Ecuador\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"contradiction\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "comparative": {"config_name": "comparative", "sample_row": "{\"premise\": \"\\\"Morris is as tall as Derek , Derek is as tall as ...\", \"hypothesis\": \"\\\"Angel is taller than Morris\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"contradiction\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "conditional": {"config_name": "conditional", "sample_row": "{\"premise\": \"\\\"Raul has not visited Moline, Anthony has not visi...\", \"hypothesis\": \"\\\"Louis has not visited Mundelein\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailment\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "context_align": {"config_name": "context_align", "sample_row": "{\"premise\": \"\\\"the nails were something\\\"\", \"hypothesis\": \"\\\"'something' here should be 'flatten the ends (of ...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "control": {"config_name": "control", "sample_row": "{\"premise\": \"\\\"100 Years of the Western Workplace Conditions in ...\", \"hypothesis\": \"\\\"Improvements in medicine led to workers earning m...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"neutral\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "coreference": {"config_name": "coreference", "sample_row": "{\"premise\": \"\\\"Ian volunteered to eat Dennis's menudo after alre...\", \"hypothesis\": \"\\\"Ian despised eating intestine.\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"not-entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "cosmoqa": {"config_name": "cosmoqa", "sample_row": "{\"premise\": \"\\\"Good Old War and person L : I saw both of these b...\", \"hypothesis\": \"\\\"This person likes music and likes to see the show...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "counterfactual": {"config_name": "counterfactual", "sample_row": "{\"premise\": \"\\\" if the stimulus bill had become hamstrung ...\", \"hypothesis\": \"\\\" should be \\\\\\\"I don't think any of us---even...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "counting": {"config_name": "counting", "sample_row": "{\"premise\": \"\\\"Troy has visited Djibouti, France, Senegal, Argen...\", \"hypothesis\": \"\\\"Troy has visited less than thirty-five places\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailment\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "drop": {"config_name": "drop", "sample_row": "{\"premise\": \"\\\"To start the season, the Lions traveled south to ...\", \"hypothesis\": \"\\\"3 points did the buccaneers need to tie in the fi...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "entailment_tree": {"config_name": "entailment_tree", "sample_row": "{\"premise\": \"\\\"leo is a kind of constellation sent2: the earth r...\", \"hypothesis\": \"\\\"the earth revolving around the sun causes leo to ...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "ester": {"config_name": "ester", "sample_row": "{\"premise\": \"\\\"A senior researcher with the State Council, or th...\", \"hypothesis\": \"\\\"promote urbanization\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "hellaswag": {"config_name": "hellaswag", "sample_row": "{\"premise\": \"\\\"[header] How to treat your girlfriend like a prin...\", \"hypothesis\": \"\\\"[substeps] Your girlfriend should be more than an...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "hypernymy": {"config_name": "hypernymy", "sample_row": "{\"premise\": \"\\\"he disliked his neighbors\\\"\", \"hypothesis\": \"\\\", the word or phrase is best characterized as a ...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "hyponymy": {"config_name": "hyponymy", "sample_row": "{\"premise\": \"\\\"crochet a bedspread\\\"\", \"hypothesis\": \"\\\"a specific type of crochet is double crochet (or ...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "kg_relations": {"config_name": "kg_relations", "sample_row": "{\"premise\": \"\\\"Diplomats say Assad 's absence from the meeting a...\", \"hypothesis\": \"\\\"Assad was buried in Syria .\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"not-entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "lexical": {"config_name": "lexical", "sample_row": "{\"premise\": \"\\\"Gonorrhea means the presence of bacteria.\\\"\", \"hypothesis\": \"\\\"Gonorrhea is caused by bacteria.\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "logiqa": {"config_name": "logiqa", "sample_row": "{\"premise\": \"\\\"Some Cantonese don't like chili, so some southern...\", \"hypothesis\": \"\\\"All Cantonese are southerners\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "monotonicity_infer": {"config_name": "monotonicity_infer", "sample_row": "{\"premise\": \"\\\"Tom said that neither parents had ever been to Bo...\", \"hypothesis\": \"\\\"Tom said that neither one of his parents had ever...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailment\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "negation": {"config_name": "negation", "sample_row": "{\"premise\": \"\\\"Ted has only visited Bahrain, Terrence has only v...\", \"hypothesis\": \"\\\"Jessie didn't visit Rwanda\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailment\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "ner": {"config_name": "ner", "sample_row": "{\"premise\": \"\\\"The government urged Western and Arab nations to ...\", \"hypothesis\": \"\\\"Western is a person\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "physicalqa": {"config_name": "physicalqa", "sample_row": "{\"premise\": \"\\\"When boiling butter, when it's ready, you can\\\"\", \"hypothesis\": \"\\\"Pour it onto a plate\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"not-entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "puns": {"config_name": "puns", "sample_row": "{\"premise\": \"\\\"Michaela heard that the agreeable tennis umpire w...\", \"hypothesis\": \"\\\"Michaela heard a pun\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "quantifier": {"config_name": "quantifier", "sample_row": "{\"premise\": \"\\\"Everyone has visited every place\\\"\", \"hypothesis\": \"\\\"Floyd didn't visit Johnny\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"neutral\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "sentiment": {"config_name": "sentiment", "sample_row": "{\"premise\": \"\\\"When asked about the product, Eniyah said, 'I had...\", \"hypothesis\": \"\\\"Eniyah liked the product . \\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "socialqa": {"config_name": "socialqa", "sample_row": "{\"premise\": \"\\\"Cameron decided to have a barbecue and gathered h...\", \"hypothesis\": \"\\\"Others would feel like attending\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "spatial": {"config_name": "spatial", "sample_row": "{\"premise\": \"\\\"The triangle is above the pink rectangle. The blu...\", \"hypothesis\": \"\\\"The pink rectangle is to the right of the blue sq...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "sprl": {"config_name": "sprl", "sample_row": "{\"premise\": \"\\\"( Both took further hits yesterday . )\\\"\", \"hypothesis\": \"\\\"Further hits existed during the taking.\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "syntactic_alternation": {"config_name": "syntactic_alternation", "sample_row": "{\"premise\": \"\\\"michael passed the salt to the person across the ...\", \"hypothesis\": \"\\\"michael passed the person across the table the sa...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "syntactic_variation": {"config_name": "syntactic_variation", "sample_row": "{\"premise\": \"\\\"Amrozi accused his brother , whom he called \\\\\\\" th...\", \"hypothesis\": \"\\\"Referring to him as only \\\\\\\" the witness \\\\\\\" , Amro...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "temporal": {"config_name": "temporal", "sample_row": "{\"premise\": \"\\\" I was so nervous for my first day of school. \\\\\\\"W...\", \"hypothesis\": \"\\\" The teacher asked us to stop talking starts afte...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "transitive": {"config_name": "transitive", "sample_row": "{\"premise\": \"\\\"a particular person was n't blessed to have a par...\", \"hypothesis\": \"\\\"that person might or might not have had that thin...\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "verbcorner": {"config_name": "verbcorner", "sample_row": "{\"premise\": \"\\\"Samantha enjoyed the blinch.\\\"\", \"hypothesis\": \"\\\"Something good happened .\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}, "verbnet": {"config_name": "verbnet", "sample_row": "{\"premise\": \"\\\"David constructed a house .\\\"\", \"hypothesis\": \"\\\"David caused the constructing .\\\"\", \"idx\": \"\\\"0\\\"\", \"gold_label\": \"\\\"entailed\\\"\"}", "columns": ["premise", "hypothesis", "idx", "gold_label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "idx": "idx", "gold_label": "gold_label"}, "dataset_description": "We introduce Curriculum as a new format of NLI benchmark for evaluation of broad-coverage linguistic phenomena. \nCurriculum contains a collection of datasets that covers 36 types of major linguistic phenomena and an evaluation procedure \nfor diagnosing how well a language model captures reasoning skills for distinct types of linguistic phenomena. \nWe show that this linguistic-phenomena-driven benchmark can serve as an effective tool for diagnosing \nmodel behavior and verifying model learning quality.\n", "dataset_name": "chenz16/curriculum_benchmark"}}, "tags": [], "is_gated": false}, "biglam/atypical_animacy": {"dataset_name": "biglam/atypical_animacy", "description": "Atypical animacy detection dataset, based on nineteenth-century sentences in English extracted from an open dataset of nineteenth-century books digitized by the British Library (available via https://doi.org/10.21250/db14, British Library Labs, 2014). \nThis dataset contains 598 sentences containing mentions of machines. Each sentence has been annotated according to the animacy and humanness of the machine in the sentence.", "downloads": 20, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"002732647_02_180_7\\\"\", \"sentence\": \"\\\"Poetic RMS OF THE CITY OF MANCHESTEI legends migh...\", \"context\": \"\\\"That there was a Roman camp on Castlefield, with-...\", \"target\": \"\\\"engine\\\"\", \"animacy\": \"0.0\", \"humanness\": \"0.0\", \"offsets\": \"[134, 140]\", \"date\": \"\\\"1891\\\"\"}", "columns": ["id", "sentence", "context", "target", "animacy", "humanness", "offsets", "date"], "columns_mapping": {"id": "id", "sentence": "sentence", "context": "context", "target": "target", "animacy": "animacy", "humanness": "humanness", "offsets": "offsets", "date": "date"}, "dataset_description": "Atypical animacy detection dataset, based on nineteenth-century sentences in English extracted from an open dataset of nineteenth-century books digitized by the British Library (available via https://doi.org/10.21250/db14, British Library Labs, 2014). \nThis dataset contains 598 sentences containing mentions of machines. Each sentence has been annotated according to the animacy and humanness of the machine in the sentence. \n", "dataset_name": "biglam/atypical_animacy"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "task_ids:intent-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "codeparrot/xlcost-text-to-code": {"dataset_name": "codeparrot/xlcost-text-to-code", "description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "downloads": 634, "configs": {"Python-snippet-level": {"config_name": "Python-snippet-level", "sample_row": "{\"text\": \"\\\"Python3 implementation of the above approach\\\"\", \"code\": \"\\\"def maxPresum ( a , b ) : NEW_LINE\\\"\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Python-program-level": {"config_name": "Python-program-level", "sample_row": "{\"text\": \"\\\"Maximum Prefix Sum possible by merging two given ...\", \"code\": \"\\\"def maxPresum ( a , b ) : NEW_LINE INDENT X = max...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "C-snippet-level": {"config_name": "C-snippet-level", "sample_row": "{\"text\": \"\\\"C program for above approach\\\"\", \"code\": \"\\\"#include \\\"\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "C-program-level": {"config_name": "C-program-level", "sample_row": "{\"text\": \"\\\"Minimum number of coins having value equal to pow...\", \"code\": \"\\\"#include NEW_LINE void count_setbit ( i...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Java-snippet-level": {"config_name": "Java-snippet-level", "sample_row": "{\"text\": \"\\\"Java Program to implement the above approach\\\"\", \"code\": \"\\\"import java . util . * ; class GFG { static int m...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Java-program-level": {"config_name": "Java-program-level", "sample_row": "{\"text\": \"\\\"Maximum Prefix Sum possible by merging two given ...\", \"code\": \"\\\"import java . util . * ; class GFG { static int m...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Javascript-snippet-level": {"config_name": "Javascript-snippet-level", "sample_row": "{\"text\": \"\\\"Javascript Program to implement the above approac...\", \"code\": \"\\\"function maxPresum ( a , b ) {\\\"\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Javascript-program-level": {"config_name": "Javascript-program-level", "sample_row": "{\"text\": \"\\\"Maximum Prefix Sum possible by merging two given ...\", \"code\": \"\\\"function maxPresum ( a , b ) { let X = Math . max...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Csharp-snippet-level": {"config_name": "Csharp-snippet-level", "sample_row": "{\"text\": \"\\\"C # Program to implement the above approach\\\"\", \"code\": \"\\\"using System ; using System . Collections . Gener...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "Csharp-program-level": {"config_name": "Csharp-program-level", "sample_row": "{\"text\": \"\\\"Maximum Prefix Sum possible by merging two given ...\", \"code\": \"\\\"using System ; using System . Collections . Gener...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "C++-snippet-level": {"config_name": "C++-snippet-level", "sample_row": "{\"text\": \"\\\"C ++ Program to implement the above approach\\\"\", \"code\": \"\\\"#include NEW_LINE using namespace...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "C++-program-level": {"config_name": "C++-program-level", "sample_row": "{\"text\": \"\\\"Maximum Prefix Sum possible by merging two given ...\", \"code\": \"\\\"#include NEW_LINE using namespace...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "PHP-snippet-level": {"config_name": "PHP-snippet-level", "sample_row": "{\"text\": \"\\\"Function that returns true if the number represen...\", \"code\": \"\\\"< ? php function isEven ( $ arr , $ n , $ r ) {\\\"...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}, "PHP-program-level": {"config_name": "PHP-program-level", "sample_row": "{\"text\": \"\\\"Check if the number is even or odd whose digits a...\", \"code\": \"\\\"< ? php function isEven ( $ arr , $ n , $ r ) { i...\"}", "columns": ["text", "code"], "columns_mapping": {"text": "text", "code": "code"}, "dataset_description": " XLCoST is a machine learning benchmark dataset that contains fine-grained parallel data in 7 commonly used programming languages (C++, Java, Python, C#, Javascript, PHP, C), and natural language (English).", "dataset_name": "codeparrot/xlcost-text-to-code"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:multilingual", "language:code"], "is_gated": false}, "demelin/moral_stories": {"dataset_name": "demelin/moral_stories", "description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.", "downloads": 299, "configs": {"full": {"config_name": "full", "sample_row": "{\"ID\": \"\\\"37TD41K0AI7TYQGNUFTSCYCNT25SCN\\\"\", \"norm\": \"\\\"It's responsible to keep children safe.\\\"\", \"situation\": \"\\\"Kent was watching his kids playing in the backyar...\", \"intention\": \"\\\"Kent wants to add security to his back yard.\\\"\", \"moral_action\": \"\\\"Kent installs cameras around his yard to look for...\", \"moral_consequence\": \"\\\"Kent's kids feel much safer with the camera's wat...\", \"immoral_action\": \"\\\"Kent installs an electric fence around his yard t...\", \"immoral_consequence\": \"\\\"One of Kent's kids gets shocked by the fence when...\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action-lexical_bias": {"config_name": "cls-action-lexical_bias", "sample_row": "{\"ID\": \"\\\"3K4J6M3CXFR2F6AYF13KHP3W45NAGD1\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Joan ignores the text and focuses on enjoying her...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action-minimal_pairs": {"config_name": "cls-action-minimal_pairs", "sample_row": "{\"ID\": \"\\\"37XITHEISX7HXBGAFQTNMO7R7TSRCQ1\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Jenny agrees to go hiking with her husband and se...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action-norm_distance": {"config_name": "cls-action-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context-lexical_bias": {"config_name": "cls-action+context-lexical_bias", "sample_row": "{\"ID\": \"\\\"3K4J6M3CXFR2F6AYF13KHP3W45NAGD1\\\"\", \"norm\": \"\\\"It's rude to ditch a date for someone else.\\\"\", \"situation\": \"\\\"Joan is on a first date with Mitch when she gets ...\", \"intention\": \"\\\"Joan wants to have a fun night.\\\"\", \"moral_action\": \"\\\"Joan ignores the text and focuses on enjoying her...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context-minimal_pairs": {"config_name": "cls-action+context-minimal_pairs", "sample_row": "{\"ID\": \"\\\"37XITHEISX7HXBGAFQTNMO7R7TSRCQ1\\\"\", \"norm\": \"\\\"It's good to do activities together with your spo...\", \"situation\": \"\\\"Jenny's husband asks her to go hiking with him li...\", \"intention\": \"\\\"Jenny wants to tell her husband if she'll go hiki...\", \"moral_action\": \"\\\"Jenny agrees to go hiking with her husband and se...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context-norm_distance": {"config_name": "cls-action+context-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context+consequence-lexical_bias": {"config_name": "cls-action+context+consequence-lexical_bias", "sample_row": "{\"ID\": \"\\\"3K4J6M3CXFR2F6AYF13KHP3W45NAGD1\\\"\", \"norm\": \"\\\"It's rude to ditch a date for someone else.\\\"\", \"situation\": \"\\\"Joan is on a first date with Mitch when she gets ...\", \"intention\": \"\\\"Joan wants to have a fun night.\\\"\", \"moral_action\": \"\\\"Joan ignores the text and focuses on enjoying her...\", \"moral_consequence\": \"\\\"Mitch has a great time on his date with Joan and ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context+consequence-minimal_pairs": {"config_name": "cls-action+context+consequence-minimal_pairs", "sample_row": "{\"ID\": \"\\\"37XITHEISX7HXBGAFQTNMO7R7TSRCQ1\\\"\", \"norm\": \"\\\"It's good to do activities together with your spo...\", \"situation\": \"\\\"Jenny's husband asks her to go hiking with him li...\", \"intention\": \"\\\"Jenny wants to tell her husband if she'll go hiki...\", \"moral_action\": \"\\\"Jenny agrees to go hiking with her husband and se...\", \"moral_consequence\": \"\\\"Jenny's husband is happy that they still share ac...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+context+consequence-norm_distance": {"config_name": "cls-action+context+consequence-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+norm-lexical_bias": {"config_name": "cls-action+norm-lexical_bias", "sample_row": "{\"ID\": \"\\\"3K4J6M3CXFR2F6AYF13KHP3W45NAGD1\\\"\", \"norm\": \"\\\"It's rude to ditch a date for someone else.\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Joan ignores the text and focuses on enjoying her...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+norm-minimal_pairs": {"config_name": "cls-action+norm-minimal_pairs", "sample_row": "{\"ID\": \"\\\"37XITHEISX7HXBGAFQTNMO7R7TSRCQ1\\\"\", \"norm\": \"\\\"It's good to do activities together with your spo...\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Jenny agrees to go hiking with her husband and se...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-action+norm-norm_distance": {"config_name": "cls-action+norm-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action-lexical_bias": {"config_name": "cls-consequence+action-lexical_bias", "sample_row": "{\"ID\": \"\\\"39L1G8WVWRP5R6LAO337NULKXIJ1341\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Jacob invites Martin to join him and his friends ...\", \"moral_consequence\": \"\\\"Martin feels included and welcome.\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action-minimal_pairs": {"config_name": "cls-consequence+action-minimal_pairs", "sample_row": "{\"ID\": \"\\\"3AAPLD8UCDFLUY4I57CJXU5P86UTHE1\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Keisha decides to grill dinner so her kids can ru...\", \"moral_consequence\": \"\\\"Keisha's kids get exercise and stay healthy.\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action-norm_distance": {"config_name": "cls-consequence+action-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action+context-lexical_bias": {"config_name": "cls-consequence+action+context-lexical_bias", "sample_row": "{\"ID\": \"\\\"39L1G8WVWRP5R6LAO337NULKXIJ1341\\\"\", \"norm\": \"\\\"It is kind to be welcoming to a new student\\\"\", \"situation\": \"\\\"It's lunchtime and Jacob is entering the cafeteri...\", \"intention\": \"\\\"Jacob wants to find a table in the cafeteria.\\\"\", \"moral_action\": \"\\\"Jacob invites Martin to join him and his friends ...\", \"moral_consequence\": \"\\\"Martin feels included and welcome.\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action+context-minimal_pairs": {"config_name": "cls-consequence+action+context-minimal_pairs", "sample_row": "{\"ID\": \"\\\"3AAPLD8UCDFLUY4I57CJXU5P86UTHE1\\\"\", \"norm\": \"\\\"It's good to have your kids get some fresh air.\\\"...\", \"situation\": \"\\\"Keisha is at home with her two children and is co...\", \"intention\": \"\\\"Keisha wants to keep her kids occupied.\\\"\", \"moral_action\": \"\\\"Keisha decides to grill dinner so her kids can ru...\", \"moral_consequence\": \"\\\"Keisha's kids get exercise and stay healthy.\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "cls-consequence+action+context-norm_distance": {"config_name": "cls-consequence+action+context-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-action$context-norm_distance": {"config_name": "gen-action$context-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-action$context+consequence-norm_distance": {"config_name": "gen-action$context+consequence-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-consequence$action-norm_distance": {"config_name": "gen-consequence$action-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"not specified\\\"\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-consequence$action+context-norm_distance": {"config_name": "gen-consequence$action+context-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"not specified\\\"\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-norm$actions-norm_distance": {"config_name": "gen-norm$actions-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"not specified\\\"\", \"intention\": \"\\\"not specified\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"Phil decides he and his family need to spend less...\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-norm$actions+context-norm_distance": {"config_name": "gen-norm$actions+context-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"not specified\\\"\", \"immoral_action\": \"\\\"Phil decides he and his family need to spend less...\", \"immoral_consequence\": \"\\\"not specified\\\"\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}, "gen-norm$actions+context+consequences-norm_distance": {"config_name": "gen-norm$actions+context+consequences-norm_distance", "sample_row": "{\"ID\": \"\\\"3X65QVEQI1L6UDLXA6WR0CYS2QZLC51\\\"\", \"norm\": \"\\\"It is good to earn income to support your family\\\"...\", \"situation\": \"\\\"Phil was trying to find ways to help his family f...\", \"intention\": \"\\\"Phil wants to help the bottom line.\\\"\", \"moral_action\": \"\\\"Phil decides that he and his family need to earn ...\", \"moral_consequence\": \"\\\"Phil signs up for Mturk tasks and starts working ...\", \"immoral_action\": \"\\\"Phil decides he and his family need to spend less...\", \"immoral_consequence\": \"\\\"Phil manages to cut the water bill in half before...\", \"label\": \"1\"}", "columns": ["ID", "norm", "situation", "intention", "moral_action", "moral_consequence", "immoral_action", "immoral_consequence", "label"], "columns_mapping": {"ID": "ID", "norm": "norm", "situation": "situation", "intention": "intention", "moral_action": "moral_action", "moral_consequence": "moral_consequence", "immoral_action": "immoral_action", "immoral_consequence": "immoral_consequence", "label": "label"}, "dataset_description": "Moral Stories is a crowd-sourced dataset of structured, branching narratives for the study of grounded, goal-oriented \nsocial reasoning. For detailed information, see https://aclanthology.org/2021.emnlp-main.54.pdf.\n", "dataset_name": "demelin/moral_stories"}}, "tags": ["task_categories:multiple-choice", "task_categories:text-generation", "task_categories:text-classification", "task_ids:multiple-choice-qa", "task_ids:language-modeling", "task_ids:text-scoring", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ArthurBaia/squad_v1_pt_br": {"dataset_name": "ArthurBaia/squad_v1_pt_br", "description": "This dataset was translated by Deep Learning Brazil", "downloads": 101, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"5733be284776f41900661182\\\"\", \"title\": \"\\\"University_of_Notre_Dame\\\"\", \"context\": \"\\\"Arquitetonicamente, a escola tem um car\\\\u00e1ter ...\", \"question\": \"\\\"A quem a Virgem Maria supostamente apareceu em 18...\", \"answers.text\": \"[\\\"Santa Bernadette Soubirous\\\"]\", \"answers.answer_start\": \"[533]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "This dataset was translated by Deep Learning Brazil\n", "dataset_name": "ArthurBaia/squad_v1_pt_br"}}, "tags": [], "is_gated": false}, "nbroad/mediasum": {"dataset_name": "nbroad/mediasum", "description": "This large-scale media interview dataset contains 463.6K transcripts with abstractive summaries, \ncollected from interview transcripts and overview / topic descriptions from NPR and CNN.", "downloads": 16, "configs": {"mediasum": {"config_name": "mediasum", "sample_row": "{\"id\": \"\\\"NPR-1\\\"\", \"program\": \"\\\"News & Notes\\\"\", \"date\": \"\\\"2007-11-28\\\"\", \"url\": \"\\\"https://www.npr.org/templates/story/story.php?sto...\", \"title\": \"\\\"Black Actors Give Bible Star Appeal\\\"\", \"summary\": \"\\\"More than 400 black actors, artists and ministers...\", \"utt\": \"[\\\"Now, moving on, Forest Whitaker as Moses, Tisha ...\", \"speaker\": \"[\\\"FARAI CHIDEYA, host\\\", \\\"FARAI CHIDEYA, host\\\", \\\"Mr...\"}", "columns": ["id", "program", "date", "url", "title", "summary", "utt", "speaker"], "columns_mapping": {"id": "id", "program": "program", "date": "date", "url": "url", "title": "title", "summary": "summary", "utt": "utt", "speaker": "speaker"}, "dataset_description": "This large-scale media interview dataset contains 463.6K transcripts with abstractive summaries, \ncollected from interview transcripts and overview / topic descriptions from NPR and CNN.\n", "dataset_name": "nbroad/mediasum"}}, "tags": ["task_categories:summarization", "multilinguality:monolingual", "language:en"], "is_gated": false}, "tner/conll2003": {"dataset_name": "tner/conll2003", "description": "[CoNLL 2003 NER dataset](https://aclanthology.org/W03-0419/)", "downloads": 21, "configs": {"conll2003": {"config_name": "conll2003", "sample_row": "{\"tokens\": \"[\\\"EU\\\", \\\"rejects\\\", \\\"German\\\", \\\"call\\\", \\\"to\\\", \\\"boycott...\", \"tags\": \"[1, 0, 2, 0, 0, 0, 2, 0, 0]\"}", "columns": ["tokens", "tags"], "columns_mapping": {"tokens": "tokens", "tags": "tags"}, "dataset_description": "[CoNLL 2003 NER dataset](https://aclanthology.org/W03-0419/)", "dataset_name": "tner/conll2003"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "multilinguality:monolingual", "language:en"], "is_gated": false}, "tner/wnut2017": {"dataset_name": "tner/wnut2017", "description": "[WNUT 2017 NER dataset](https://aclanthology.org/W17-4418/)", "downloads": 84, "configs": {"wnut2017": {"config_name": "wnut2017", "sample_row": "{\"tokens\": \"[\\\"@paulwalk\\\", \\\"It\\\", \\\"'s\\\", \\\"the\\\", \\\"view\\\", \\\"from\\\", \\\"...\", \"tags\": \"[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1...\"}", "columns": ["tokens", "tags"], "columns_mapping": {"tokens": "tokens", "tags": "tags"}, "dataset_description": "[WNUT 2017 NER dataset](https://aclanthology.org/W17-4418/)", "dataset_name": "tner/wnut2017"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "multilinguality:monolingual", "language:en"], "is_gated": false}, "tner/bc5cdr": {"dataset_name": "tner/bc5cdr", "description": "[Bio Creative 5 CDR NER dataset](https://academic.oup.com/database/article/doi/10.1093/database/baw032/2630271?login=true)", "downloads": 958, "configs": {"bc5cdr": {"config_name": "bc5cdr", "sample_row": "{\"tokens\": \"[\\\"Naloxone\\\", \\\"reverses\\\", \\\"the\\\", \\\"antihypertensive\\\"...\", \"tags\": \"[1, 0, 0, 0, 0, 0, 1, 0]\"}", "columns": ["tokens", "tags"], "columns_mapping": {"tokens": "tokens", "tags": "tags"}, "dataset_description": "[Bio Creative 5 CDR NER dataset](https://academic.oup.com/database/article/doi/10.1093/database/baw032/2630271?login=true)", "dataset_name": "tner/bc5cdr"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "multilinguality:monolingual", "language:en"], "is_gated": false}, "pyronear/openfire": {"dataset_name": "pyronear/openfire", "description": "OpenFire is an image classification dataset for wildfire detection, collected\nfrom web searches.", "downloads": 131, "configs": {"default": {"config_name": "default", "sample_row": "{\"image_url\": \"\\\"https://get.pxhere.com/photo/cloud-sky-atmosphere...\", \"is_wildfire\": \"false\"}", "columns": ["image_url", "is_wildfire"], "columns_mapping": {"image_url": "image_url", "is_wildfire": "is_wildfire"}, "dataset_description": "OpenFire is an image classification dataset for wildfire detection, collected\nfrom web searches.\n", "dataset_name": "pyronear/openfire"}}, "tags": ["task_categories:image-classification", "annotations_creators:crowdsourced", "source_datasets:original"], "is_gated": false}, "biglam/clmet_3_1": {"dataset_name": "biglam/clmet_3_1", "description": "The Corpus of Late Modern English Texts, version 3.1 (CLMET3.1) has been created by Hendrik De Smet, \nSusanne Flach, Hans-J\u00fcrgen Diller and Jukka Tyrkk\u00f6, as an offshoot of a bigger project developing a database of text \ndescriptors (Diller, De Smet & Tyrkk\u00f6 2011). CLMET3.1 is a principled collection of public domain texts drawn from \nvarious online archiving projects. This dataset can be used for part-of-speech tagging, NER and text classification", "downloads": 46, "configs": {"plain": {"config_name": "plain", "sample_row": "{\"text\": \"\\\"\\\\nA TREATISE Concerning the PRINCIPLES OF Human K...\", \"genre\": \"\\\"Treatise\\\"\", \"subgenre\": \"\\\"treat\\\"\", \"year\": \"\\\"1710\\\"\", \"quarter_cent\": \"\\\"1700-1724\\\"\", \"decade\": \"\\\"1710s\\\"\", \"title\": \"\\\"A treatise concerning the principles of human kno...\", \"author\": \"\\\"Berkeley, George\\\"\", \"notes\": \"\\\"\\\"\", \"comments\": \"\\\"\\\"\", \"period\": \"\\\"1710-1780\\\"\", \"id\": \"\\\"1\\\"\"}", "columns": ["text", "genre", "subgenre", "year", "quarter_cent", "decade", "title", "author", "notes", "comments", "period", "id"], "columns_mapping": {"text": "text", "genre": "genre", "subgenre": "subgenre", "year": "year", "quarter_cent": "quarter_cent", "decade": "decade", "title": "title", "author": "author", "notes": "notes", "comments": "comments", "period": "period", "id": "id"}, "dataset_description": "The Corpus of Late Modern English Texts, version 3.1 (CLMET3.1) has been created by Hendrik De Smet, \nSusanne Flach, Hans-J\u00fcrgen Diller and Jukka Tyrkk\u00f6, as an offshoot of a bigger project developing a database of text \ndescriptors (Diller, De Smet & Tyrkk\u00f6 2011). CLMET3.1 is a principled collection of public domain texts drawn from \nvarious online archiving projects. This dataset can be used for part-of-speech tagging, NER and text classification\n", "dataset_name": "biglam/clmet_3_1"}, "class": {"config_name": "class", "sample_row": "{\"text\": \"[\\\"A\\\", \\\"TREATISE\\\", \\\"Concerning\\\", \\\"the\\\", \\\"PRINCIPLES...\", \"pos_tags\": \"[2, 8, 11, 2, 8, 5, 8, 8, 7, 8, 6, 7, 1, 2, 0, 8, ...\", \"genre\": \"\\\"Treatise\\\"\", \"subgenre\": \"\\\"treat\\\"\", \"year\": \"\\\"1710\\\"\", \"quarter_cent\": \"\\\"1700-1724\\\"\", \"decade\": \"\\\"1710s\\\"\", \"title\": \"\\\"A treatise concerning the principles of human kno...\", \"author\": \"\\\"Berkeley, George\\\"\", \"notes\": \"\\\"\\\"\", \"comments\": \"\\\"\\\"\", \"period\": \"\\\"1710-1780\\\"\", \"id\": \"\\\"1\\\"\"}", "columns": ["text", "pos_tags", "genre", "subgenre", "year", "quarter_cent", "decade", "title", "author", "notes", "comments", "period", "id"], "columns_mapping": {"text": "text", "pos_tags": "pos_tags", "genre": "genre", "subgenre": "subgenre", "year": "year", "quarter_cent": "quarter_cent", "decade": "decade", "title": "title", "author": "author", "notes": "notes", "comments": "comments", "period": "period", "id": "id"}, "dataset_description": "The Corpus of Late Modern English Texts, version 3.1 (CLMET3.1) has been created by Hendrik De Smet, \nSusanne Flach, Hans-J\u00fcrgen Diller and Jukka Tyrkk\u00f6, as an offshoot of a bigger project developing a database of text \ndescriptors (Diller, De Smet & Tyrkk\u00f6 2011). CLMET3.1 is a principled collection of public domain texts drawn from \nvarious online archiving projects. This dataset can be used for part-of-speech tagging, NER and text classification\n", "dataset_name": "biglam/clmet_3_1"}, "pos": {"config_name": "pos", "sample_row": "{\"text\": \"[\\\"A\\\", \\\"TREATISE\\\", \\\"Concerning\\\", \\\"the\\\", \\\"PRINCIPLES...\", \"pos_tags\": \"[2, 10, 28, 2, 12, 5, 12, 12, 38, 10, 16, 38, 18, ...\", \"genre\": \"\\\"Treatise\\\"\", \"subgenre\": \"\\\"treat\\\"\", \"year\": \"\\\"1710\\\"\", \"quarter_cent\": \"\\\"1700-1724\\\"\", \"decade\": \"\\\"1710s\\\"\", \"title\": \"\\\"A treatise concerning the principles of human kno...\", \"author\": \"\\\"Berkeley, George\\\"\", \"notes\": \"\\\"\\\"\", \"comments\": \"\\\"\\\"\", \"period\": \"\\\"1710-1780\\\"\", \"id\": \"\\\"1\\\"\"}", "columns": ["text", "pos_tags", "genre", "subgenre", "year", "quarter_cent", "decade", "title", "author", "notes", "comments", "period", "id"], "columns_mapping": {"text": "text", "pos_tags": "pos_tags", "genre": "genre", "subgenre": "subgenre", "year": "year", "quarter_cent": "quarter_cent", "decade": "decade", "title": "title", "author": "author", "notes": "notes", "comments": "comments", "period": "period", "id": "id"}, "dataset_description": "The Corpus of Late Modern English Texts, version 3.1 (CLMET3.1) has been created by Hendrik De Smet, \nSusanne Flach, Hans-J\u00fcrgen Diller and Jukka Tyrkk\u00f6, as an offshoot of a bigger project developing a database of text \ndescriptors (Diller, De Smet & Tyrkk\u00f6 2011). CLMET3.1 is a principled collection of public domain texts drawn from \nvarious online archiving projects. This dataset can be used for part-of-speech tagging, NER and text classification\n", "dataset_name": "biglam/clmet_3_1"}}, "tags": ["task_categories:text-classification", "task_categories:fill-mask", "task_ids:multi-label-classification", "task_ids:masked-language-modeling", "annotations_creators:expert-generated", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "breakend/nllb-multi-domain": {"dataset_name": "breakend/nllb-multi-domain", "description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.", "downloads": 29, "configs": {"eng_Latn-ayr_Latn": {"config_name": "eng_Latn-ayr_Latn", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. and took a 3 mile jaunt all...\", \"sentence_ayr_Latn\": \"\\\"Phisqha alwa pachaw sartapxta ukatx utaj jak\\\\u201...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_ayr_Latn"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_ayr_Latn": "sentence_ayr_Latn"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}, "eng_Latn-bho_Deva": {"config_name": "eng_Latn-bho_Deva", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. and took a 3 mile jaunt all...\", \"sentence_bho_Deva\": \"\\\"\\\\u0939\\\\u092e \\\\u0938\\\\u0941\\\\u092c\\\\u0939 5 \\\\u092c\\\\u0...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_bho_Deva"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_bho_Deva": "sentence_bho_Deva"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}, "eng_Latn-dyu_Latn": {"config_name": "eng_Latn-dyu_Latn", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. and took a 3 mile jaunt all...\", \"sentence_dyu_Latn\": \"\\\"An wila la s\\\\u0254g\\\\u0254ma fitiri f\\\\u025b ka kil...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_dyu_Latn"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_dyu_Latn": "sentence_dyu_Latn"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}, "eng_Latn-fur_Latn": {"config_name": "eng_Latn-fur_Latn", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. and took a 3 mile jaunt all...\", \"sentence_fur_Latn\": \"\\\"Si sin dismots aes 5 di buinore, o vin fat une gj...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_fur_Latn"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_fur_Latn": "sentence_fur_Latn"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}, "eng_Latn-rus_Cyrl": {"config_name": "eng_Latn-rus_Cyrl", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. and took a 3 mile jaunt all...\", \"sentence_rus_Cyrl\": \"\\\"\\\\u041c\\\\u044b \\\\u0432\\\\u0441\\\\u0442\\\\u0430\\\\u043b\\\\u0438...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_rus_Cyrl"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_rus_Cyrl": "sentence_rus_Cyrl"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}, "eng_Latn-wol_Latn": {"config_name": "eng_Latn-wol_Latn", "sample_row": "{\"id\": \"1\", \"domain\": \"\\\"chat\\\"\", \"sentence_eng_Latn\": \"\\\"We got up a five a.m. and took a 3 mile jaunt all...\", \"sentence_wol_Latn\": \"\\\"Jur\\\\u00f3omi waxtu ci suba la\\\\u00f1u jog ba noppi...\"}", "columns": ["id", "domain", "sentence_eng_Latn", "sentence_wol_Latn"], "columns_mapping": {"id": "id", "domain": "domain", "sentence_eng_Latn": "sentence_eng_Latn", "sentence_wol_Latn": "sentence_wol_Latn"}, "dataset_description": "NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences.\n", "dataset_name": "breakend/nllb-multi-domain"}}, "tags": ["annotations_creators:found", "multilinguality:multilingual", "multilinguality:translation", "source_datasets:extended|flores", "language:en", "language:ru", "language:ayr", "language:bho", "language:dyu", "language:fur", "language:wol"], "is_gated": false}, "muibk/wmt19_metrics_task": {"dataset_name": "muibk/wmt19_metrics_task", "description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.", "downloads": 26, "configs": {"de-cs": {"config_name": "de-cs", "sample_row": "{\"translation.de\": \"\\\"Walisische AMs (Mitglieder der Versammlung) sorge...\", \"translation.cs\": \"\\\"Welsh AMS (\\\\u010dlenov\\\\u00e9 shrom\\\\u00e1\\\\u017ed\\\\u...\", \"mt_system\": \"\\\"online-Y.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"null\", \"wmt-z\": \"null\", \"pair\": \"\\\"de-cs\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u010clenov\\\\u00e9 Vel\\\\u0161sk\\\\u00e9ho n\\\\u00e1rodn...\"}", "columns": ["translation_de", "translation_cs", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.de": "translation_de", "translation.cs": "translation_cs", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "de-en": {"config_name": "de-en", "sample_row": "{\"translation.de\": \"\\\"Sch\\\\u00f6ne M\\\\u00fcnchnerin 2018: Sch\\\\u00f6ne M\\\\u...\", \"translation.en\": \"\\\"Beautiful Munich 2018: Beautiful Munich 2018 in H...\", \"mt_system\": \"\\\"online-Y.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"100.0\", \"wmt-z\": \"0.577333331316636\", \"pair\": \"\\\"de-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"abendzeitung-muenchen.de.213584\\\"\", \"ref\": \"\\\"The Beauty of Munich 2018: the Beauty of Munich 2...\"}", "columns": ["translation_de", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.de": "translation_de", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"translation.de\": \"\\\"Europa-Parteitag der Linken : Kipping: Europa ist...\", \"translation.fr\": \"\\\"Europe-Congr\\\\u00e8s du parti de la Gauche : Kippi...\", \"mt_system\": \"\\\"online-G.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"22.0\", \"wmt-z\": \"-2.6893683822659\", \"pair\": \"\\\"de-fr\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"euelections\\\"\", \"ref\": \"\\\"Kipping au congr\\\\u00e8s de die Linke sur l'Europe...\"}", "columns": ["translation_de", "translation_fr", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.de": "translation_de", "translation.fr": "translation_fr", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-cs": {"config_name": "en-cs", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.cs\": \"\\\"Welsh Ams se boj\\\\u00ed o \\\\\\\"vypad\\\\u00e1 jako ply\\\\u...\", \"mt_system\": \"\\\"online-X.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"44.0\", \"wmt-z\": \"-2.03458212900238\", \"pair\": \"\\\"en-cs\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u010clenov\\\\u00e9 Vel\\\\u0161sk\\\\u00e9ho n\\\\u00e1rodn...\"}", "columns": ["translation_en", "translation_cs", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.cs": "translation_cs", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-de": {"config_name": "en-de", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.de\": \"\\\"Walisische AMs besorgt dar\\\\u00fcber, dass sie \\\\u2...\", \"mt_system\": \"\\\"Microsoft-WMT19-document-level.6808\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"99.0\", \"wmt-z\": \"0.570916127533967\", \"pair\": \"\\\"en-de\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"Walisische Ageordnete sorgen sich \\\\\\\"wie D\\\\u00f6de...\"}", "columns": ["translation_en", "translation_de", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.de": "translation_de", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-fi": {"config_name": "en-fi", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.fi\": \"\\\"Walesin kansalliskokouksen j\\\\u00e4senet pelk\\\\u00e...\", \"mt_system\": \"\\\"Human\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"null\", \"wmt-z\": \"null\", \"pair\": \"\\\"en-fi\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"Walesin kansalliskokouksen j\\\\u00e4senet pelk\\\\u00e...\"}", "columns": ["translation_en", "translation_fi", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.fi": "translation_fi", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-gu": {"config_name": "en-gu", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.gu\": \"\\\"\\\\u0ab5\\\\u0ac7\\\\u0ab2\\\\u0acd\\\\u0ab6 \\\\u0a86\\\\u0a82\\\\u0ab8...\", \"mt_system\": \"\\\"UdS-DFKI.6866\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"20.0\", \"wmt-z\": \"-1.13438006642\", \"pair\": \"\\\"en-gu\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u0ab5\\\\u0ac7\\\\u0ab2\\\\u0acd\\\\u0ab8\\\\u0aa8\\\\u0abe \\\\u0a8f...\"}", "columns": ["translation_en", "translation_gu", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.gu": "translation_gu", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-kk": {"config_name": "en-kk", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.kk\": \"\\\"\\\\u0412\\\\u0435\\\\u043b\\\\u044c\\\\u0448 \\\\u0410\\\\u041c\\\\u0441...\", \"mt_system\": \"\\\"NEU.6755\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"18.5\", \"wmt-z\": \"-1.10972245101563\", \"pair\": \"\\\"en-kk\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u0423\\\\u044d\\\\u043b\\\\u0441\\\\u0442\\\\u0456\\\\u04a3 \\\\u0430...\"}", "columns": ["translation_en", "translation_kk", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.kk": "translation_kk", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-lt": {"config_name": "en-lt", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.lt\": \"\\\"Welsh AMs susir\\\\u016bpin\\\\u0119 d\\\\u0117l \\\\u201ei\\\\u...\", \"mt_system\": \"\\\"TartuNLP-c.6510\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"78.0\", \"wmt-z\": \"-0.443156602337598\", \"pair\": \"\\\"en-lt\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"Velso Asambl\\\\u0117jos nariai bijo b\\\\u016bti i\\\\u01...\"}", "columns": ["translation_en", "translation_lt", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.lt": "translation_lt", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-ru": {"config_name": "en-ru", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.ru\": \"\\\"\\\\u0412\\\\u0430\\\\u043b\\\\u043b\\\\u0438\\\\u0439\\\\u0441\\\\u043a\\\\...\", \"mt_system\": \"\\\"TartuNLP-u.6645\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"null\", \"wmt-z\": \"null\", \"pair\": \"\\\"en-ru\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u0427\\\\u043b\\\\u0435\\\\u043d\\\\u044b \\\\u041d\\\\u0430\\\\u0446...\"}", "columns": ["translation_en", "translation_ru", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.ru": "translation_ru", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "en-zh": {"config_name": "en-zh", "sample_row": "{\"translation.en\": \"\\\"Welsh AMs worried about 'looking like muppets'\\\"...\", \"translation.zh\": \"\\\"\\\\u5a01\\\\u5c14\\\\u58eb AM \\\\u62c5\\\\u5fc3\\\\u201c\\\\u770b\\\\u8...\", \"mt_system\": \"\\\"Baidu-system.6932\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"50.0\", \"wmt-z\": \"-2.56158435006466\", \"pair\": \"\\\"en-zh\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"bbc.381790\\\"\", \"ref\": \"\\\"\\\\u5a01\\\\u5c14\\\\u58eb AM \\\\u62c5\\\\u5fc3\\\\u201d\\\\u50cf\\\\u8...\"}", "columns": ["translation_en", "translation_zh", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.en": "translation_en", "translation.zh": "translation_zh", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "fi-en": {"config_name": "fi-en", "sample_row": "{\"translation.fi\": \"\\\"Eemeli Kouki johti Hurrikaanin kotivoittoon avauk...\", \"translation.en\": \"\\\"Eemeli A. Hurrikaanin led to victory at home - Sa...\", \"mt_system\": \"\\\"parfda.6526\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"66.0\", \"wmt-z\": \"-0.720709405878421\", \"pair\": \"\\\"fi-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"ess.fi.43771\\\"\", \"ref\": \"\\\"Eemeli Kouki led Hurrikaanit to home victory in t...\"}", "columns": ["translation_fi", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.fi": "translation_fi", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "fr-de": {"config_name": "fr-de", "sample_row": "{\"translation.fr\": \"\\\"Kipping au congr\\\\u00e8s de die Linke sur l'Europe...\", \"translation.de\": \"\\\"Kipping beim Linken Congress on Europe: Europa is...\", \"mt_system\": \"\\\"online-Y.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"null\", \"wmt-z\": \"null\", \"pair\": \"\\\"fr-de\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"euelections\\\"\", \"ref\": \"\\\"Europa-Parteitag der Linken : Kipping: Europa ist...\"}", "columns": ["translation_fr", "translation_de", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.fr": "translation_fr", "translation.de": "translation_de", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "gu-en": {"config_name": "gu-en", "sample_row": "{\"translation.gu\": \"\\\"\\\\u0aaa\\\\u0aa4\\\\u0a82\\\\u0a9c\\\\u0ab2\\\\u0ac0 \\\\u0ab2\\\\u0acb...\", \"translation.en\": \"\\\"This took pata.mjalii nikldyo frog, was super baa...\", \"mt_system\": \"\\\"UdS-DFKI.6861\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"6.0\", \"wmt-z\": \"-1.32251345941323\", \"pair\": \"\\\"gu-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"gu.webdunia.com.113\\\"\", \"ref\": \"\\\"Frog inside Patanjali \\\\u2018Aata\\\\u2019 (flour) pa...\"}", "columns": ["translation_gu", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.gu": "translation_gu", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "kk-en": {"config_name": "kk-en", "sample_row": "{\"translation.kk\": \"\\\"\\\\u0492\\\\u0430\\\\u0436\\\\u0430\\\\u0439\\\\u044b\\\\u043f \\\\u049b...\", \"translation.en\": \"\\\"\\\\u049b\\\\u04b1\\\\u043b\\\\u0430\\\\u049b\\\\u049b\\\\u0430\\\\u043f ...\", \"mt_system\": \"\\\"DBMS-KU_KKEN.6726\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"70.0\", \"wmt-z\": \"-0.649670870333131\", \"pair\": \"\\\"kk-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"egemen.kz.9219\\\"\", \"ref\": \"\\\"Wonderful headphones.\\\"\"}", "columns": ["translation_kk", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.kk": "translation_kk", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "lt-en": {"config_name": "lt-en", "sample_row": "{\"translation.lt\": \"\\\"\\\\\\\"MG Baltic\\\\\\\" byla: naujasis Gustainio advokatas ...\", \"translation.en\": \"\\\"Case of \\\\\\\"MG Baltic\\\\\\\": New Gustainis lawyer says ...\", \"mt_system\": \"\\\"online-A.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"68.0\", \"wmt-z\": \"-0.0612950663855201\", \"pair\": \"\\\"lt-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"delfi.lt.492\\\"\", \"ref\": \"\\\"MG Baltic case: a new advocate of Mr. Gustainis s...\"}", "columns": ["translation_lt", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.lt": "translation_lt", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "ru-en": {"config_name": "ru-en", "sample_row": "{\"translation.ru\": \"\\\"\\\\u041d\\\\u0430\\\\u0437\\\\u0432\\\\u0430\\\\u043d\\\\u043e \\\\u0447...\", \"translation.en\": \"\\\"The number of recruits from Ukraine preparing to ...\", \"mt_system\": \"\\\"online-Y.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"97.0\", \"wmt-z\": \"0.33353407846393\", \"pair\": \"\\\"ru-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"izvestiya.300303\\\"\", \"ref\": \"\\\"The number of new Ukrainian recruits ready to go ...\"}", "columns": ["translation_ru", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.ru": "translation_ru", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}, "zh-en": {"config_name": "zh-en", "sample_row": "{\"translation.zh\": \"\\\"\\\\u5f20\\\\u5149\\\\u519b\\\\u88ab\\\\u4efb\\\\u547d\\\\u4e3a\\\\u5e7f\\\\...\", \"translation.en\": \"\\\"Zhang Guangjun was appointed vice governor of Gua...\", \"mt_system\": \"\\\"online-G.0\\\"\", \"mqm\": \"null\", \"wmt-raw\": \"100.0\", \"wmt-z\": \"1.0643780388098\", \"pair\": \"\\\"zh-en\\\"\", \"dataset\": \"\\\"wmt19\\\"\", \"sent_id\": \"0\", \"doc_name\": \"\\\"newstest2019\\\"\", \"doc_ref\": \"\\\"chinanews.com.1423\\\"\", \"ref\": \"\\\"Zhang Guangjun was appointed as the Vice Governor...\"}", "columns": ["translation_zh", "translation_en", "mt_system", "mqm", "wmt-raw", "wmt-z", "pair", "dataset", "sent_id", "doc_name", "doc_ref", "ref"], "columns_mapping": {"translation.zh": "translation_zh", "translation.en": "translation_en", "mt_system": "mt_system", "mqm": "mqm", "wmt-raw": "wmt-raw", "wmt-z": "wmt-z", "pair": "pair", "dataset": "dataset", "sent_id": "sent_id", "doc_name": "doc_name", "doc_ref": "doc_ref", "ref": "ref"}, "dataset_description": "This shared task will examine automatic evaluation metrics for machine translation. We will provide\nyou with all of the translations produced in the translation task along with the human reference translations.\nYou will return your automatic metric scores for translations at the system-level and/or at the sentence-level.\nWe will calculate the system-level and sentence-level correlations of your scores with WMT19 human judgements\nonce the manual evaluation has been completed.\n", "dataset_name": "muibk/wmt19_metrics_task"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation"], "is_gated": false}, "tarteel-ai/quranqa": {"dataset_name": "tarteel-ai/quranqa", "description": "The absence of publicly available reusable test collections for Arabic question answering on the Holy Qur\u2019an has impeded the possibility of fairly comparing the performance of systems in that domain. In this article, we introduce AyaTEC, a reusable test collection for verse-based question answering on the Holy Qur\u2019an, which serves as a common experimental testbed for this task. AyaTEC includes 207 questions (with their corresponding 1,762 answers) covering 11 topic categories of the Holy Qur\u2019an that target the information needs of both curious and skeptical users. To the best of our effort, the answers to the questions (each represented as a sequence of verses) in AyaTEC were exhaustive\u2014that is, all qur\u2019anic verses that directly answered the questions were exhaustively extracted and annotated. To facilitate the use of AyaTEC in evaluating the systems designed for that task, we propose several evaluation measures to support the different types of questions and the nature of verse-based answers while integrating the concept of partial matching of answers in the evaluation.", "downloads": 274, "configs": {"shared_task": {"config_name": "shared_task", "sample_row": "{\"pq_id\": \"\\\"2:8-16_364\\\"\", \"passage\": \"\\\"\\\\u0648\\\\u0645\\\\u0646 \\\\u0627\\\\u0644\\\\u0646\\\\u0627\\\\u0633...\", \"surah\": \"2\", \"verses\": \"\\\"8-16\\\"\", \"question\": \"\\\"\\\\u0644\\\\u0645\\\\u0627\\\\u0630\\\\u0627 \\\\u0633\\\\u064a\\\\u064f...\", \"answers.text\": \"[\\\"\\\\u0623\\\\u0648\\\\u0644\\\\u0626\\\\u0643 \\\\u0627\\\\u0644\\\\u063...\", \"answers.answer_start\": \"[504]\"}", "columns": ["pq_id", "passage", "surah", "verses", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"pq_id": "pq_id", "passage": "passage", "surah": "surah", "verses": "verses", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "The absence of publicly available reusable test collections for Arabic question answering on the Holy Qur\u2019an has impeded the possibility of fairly comparing the performance of systems in that domain. In this article, we introduce AyaTEC, a reusable test collection for verse-based question answering on the Holy Qur\u2019an, which serves as a common experimental testbed for this task. AyaTEC includes 207 questions (with their corresponding 1,762 answers) covering 11 topic categories of the Holy Qur\u2019an that target the information needs of both curious and skeptical users. To the best of our effort, the answers to the questions (each represented as a sequence of verses) in AyaTEC were exhaustive\u2014that is, all qur\u2019anic verses that directly answered the questions were exhaustively extracted and annotated. To facilitate the use of AyaTEC in evaluating the systems designed for that task, we propose several evaluation measures to support the different types of questions and the nature of verse-based answers while integrating the concept of partial matching of answers in the evaluation.\n", "dataset_name": "tarteel-ai/quranqa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ar", "quran", "qa"], "is_gated": false}, "biglam/contentious_contexts": {"dataset_name": "biglam/contentious_contexts", "description": "This dataset contains extracts from historical Dutch newspapers which have been containing keywords of potentially contentious words (according to present-day sensibilities). \nThe dataset contains multiple annotations per instance, given the option to quantify agreement scores for annotations. This dataset can be used to track how words and their meanings have changed over time", "downloads": 11, "configs": {"default": {"config_name": "default", "sample_row": "{\"extract_id\": \"\\\"H99\\\"\", \"text\": \"\\\" Hollandsche IJzeren Spoorweg-Maatschappij een vi...\", \"target\": \"\\\"\\\\ud835\\\\ude5c\\\\ud835\\\\ude5a\\\\ud835\\\\ude62\\\\ud835\\\\ude5a\\\\...\", \"annotator_responses_english\": \"[{\\\"id\\\": \\\"unknown_2a\\\", \\\"response\\\": \\\"Not contentious...\", \"annotator_responses_dutch\": \"[{\\\"id\\\": \\\"unknown_2a\\\", \\\"response\\\": \\\"Niet omstreden\\\"...\", \"annotator_suggestions\": \"[{\\\"id\\\": \\\"unknown_2a\\\", \\\"suggestion\\\": \\\"\\\"}, {\\\"id\\\": \\\"u...\"}", "columns": ["extract_id", "text", "target", "annotator_responses_english", "annotator_responses_dutch", "annotator_suggestions"], "columns_mapping": {"extract_id": "extract_id", "text": "text", "target": "target", "annotator_responses_english": "annotator_responses_english", "annotator_responses_dutch": "annotator_responses_dutch", "annotator_suggestions": "annotator_suggestions"}, "dataset_description": "This dataset contains extracts from historical Dutch newspapers which have been containing keywords of potentially contentious words (according to present-day sensibilities). \nThe dataset contains multiple annotations per instance, given the option to quantify agreement scores for annotations. This dataset can be used to track how words and their meanings have changed over time\n", "dataset_name": "biglam/contentious_contexts"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-scoring", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:nl", "newspapers", "historic", "dutch", "problematic", "ConConCor"], "is_gated": false}, "chintagunta85/bc2gm_test": {"dataset_name": "chintagunta85/bc2gm_test", "description": "Nineteen teams presented results for the Gene Mention Task at the BioCreative II Workshop.\nIn this task participants designed systems to identify substrings in sentences corresponding to gene name mentions.\nA variety of different methods were used and the results varied with a highest achieved F1 score of 0.8721.\nHere we present brief descriptions of all the methods used and a statistical analysis of the results.\nWe also demonstrate that, by combining the results from all submissions, an F score of 0.9066 is feasible,\nand furthermore that the best result makes use of the lowest scoring submissions.\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/\nThe original dataset can be downloaded from: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-ii-corpus/\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll", "downloads": 10, "configs": {"bc2gm_corpus": {"config_name": "bc2gm_corpus", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Comparison\\\", \\\"with\\\", \\\"alkaline\\\", \\\"phosphatases\\\",...\", \"ner_tags\": \"[0, 0, 1, 2, 0, 1, 2, 2]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "Nineteen teams presented results for the Gene Mention Task at the BioCreative II Workshop.\nIn this task participants designed systems to identify substrings in sentences corresponding to gene name mentions.\nA variety of different methods were used and the results varied with a highest achieved F1 score of 0.8721.\nHere we present brief descriptions of all the methods used and a statistical analysis of the results.\nWe also demonstrate that, by combining the results from all submissions, an F score of 0.9066 is feasible,\nand furthermore that the best result makes use of the lowest scoring submissions.\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/\nThe original dataset can be downloaded from: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-ii-corpus/\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll\n", "dataset_name": "chintagunta85/bc2gm_test"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "kiddothe2b/contract-nli": {"dataset_name": "kiddothe2b/contract-nli", "description": "ContractNLI: A Benchmark Dataset for ContractNLI in English", "downloads": 38, "configs": {"contractnli_a": {"config_name": "contractnli_a", "sample_row": "{\"premise\": \"\\\"2.3 Provided that the Recipient has a written agr...\", \"hypothesis\": \"\\\"Receiving Party shall not reverse engineer any ob...\", \"label\": \"2\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The ContractNLI dataset consists of Non-Disclosure Agreements (NDAs). All NDAs have been labeled based \non several hypothesis templates as entailment, neutral or contradiction. In this version of the task\n(Task A), the input consists of the relevant part of the document w.r.t. to the hypothesis.\n", "dataset_name": "kiddothe2b/contract-nli"}, "contractnli_b": {"config_name": "contractnli_b", "sample_row": "{\"premise\": \"\\\"NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\\\\nThi...\", \"hypothesis\": \"\\\"Receiving Party shall not reverse engineer any ob...\", \"label\": \"2\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The ContractNLI dataset consists of Non-Disclosure Agreements (NDAs). All NDAs have been labeled based \non several hypothesis templates as entailment, neutral or contradiction. In this version of the task\n(Task B), the input consists of the full document.\n", "dataset_name": "kiddothe2b/contract-nli"}}, "tags": [], "is_gated": false}, "bigscience/xP3all": {"dataset_name": "bigscience/xP3all", "description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.", "downloads": 215, "configs": {"ak": {"config_name": "ak", "sample_row": "{\"inputs\": \"\\\"Kpon\\\\u0254z\\\\u0254\\\\u0301wa\\\\u0301t\\\\u0254\\\\u0301 l\\\\u0...\", \"targets\": \"\\\"Apolisifo\\\\u2184 kae\\\\u025b s\\\\u025b \\\\u025bte s\\\\u025...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ar": {"config_name": "ar", "sample_row": "{\"inputs\": \"\\\"I wonder \\\\u0645\\\\u0627 \\\\u0647\\\\u064a \\\\u0639\\\\u0627\\\\u...\", \"targets\": \"\\\"\\\\u0623\\\\u0630\\\\u0631\\\\u0628\\\\u064a\\\\u062c\\\\u0627\\\\u0646 ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "as": {"config_name": "as", "sample_row": "{\"inputs\": \"\\\"W\\\\u00ebr gi jub gi ci suufu pong bi fukki meetar ...\", \"targets\": \"\\\"\\\\u09a6\\\\u09b2\\\\u0999\\\\u09f0 \\\\u09a4\\\\u09b2\\\\u09f0 \\\\u098...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "bm": {"config_name": "bm", "sample_row": "{\"inputs\": \"\\\"\\\\u0a2e\\\\u0a3e \\\\u0a39\\\\u0a3e\\\\u0a02\\\\u0a17-\\\\u0a15\\\\u0a3...\", \"targets\": \"\\\"A bangelen Hong Kong, Ma ye kalank\\\\u0190 New York...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "bn": {"config_name": "bn", "sample_row": "{\"inputs\": \"\\\"A text in Kinyarwanda: Niba udafite amayinite ya ...\", \"targets\": \"\\\"\\\\u0986\\\\u09aa\\\\u09a8\\\\u09be\\\\u09b0 \\\\u09ab\\\\u09cb\\\\u09a8...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ca": {"config_name": "ca", "sample_row": "{\"inputs\": \"\\\"Selile eza eloko ya moke koleka mpe oyo esalaka, ...\", \"targets\": \"\\\"Una c\\\\u00e8l\\\\u00b7lula \\\\u00e9s la unitat estructu...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "code": {"config_name": "code", "sample_row": "{\"inputs\": \"\\\"A few years ago, Hitagi encountered a giant crab,...\", \"targets\": \"\\\"\\\\n#include \\\\nusing namespace std;\\\\...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "en": {"config_name": "en", "sample_row": "{\"inputs\": \"\\\"Construct a circle with radius r{\\\\\\\\displaystyle r...\", \"targets\": \"\\\"Las coordenadas polares de cualquier punto P{\\\\\\\\di...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "es": {"config_name": "es", "sample_row": "{\"inputs\": \"\\\"Creemos firmemente que la transversalidad entre e...\", \"targets\": \"\\\"Creemos firmemente que las sinergias entre los ca...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "eu": {"config_name": "eu", "sample_row": "{\"inputs\": \"\\\"A text in Bambara: Kabini nin adamaden nana Galap...\", \"targets\": \"\\\"Gizakiak Galapagoetara iritsi zirenetik, ugaztun ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "fon": {"config_name": "fon", "sample_row": "{\"inputs\": \"\\\"Yidlanzana leziqhingi elineziqhingi ezingu-15 ezi...\", \"targets\": \"\\\"Kpl\\\\u00e9kpl\\\\u00e9 t\\\\u0254t\\\\u025bnt\\\\u00ednto 15 w...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "fr": {"config_name": "fr", "sample_row": "{\"inputs\": \"\\\"Text in Kinyarwanda: \\\\u2190 Covid-19: OMS yagiriy...\", \"targets\": \"\\\"Covid-19: l'OMS appelle \\\\u00e0 restreindre l'acc\\\\...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "gu": {"config_name": "gu", "sample_row": "{\"inputs\": \"\\\"W\\\\u2184sii great pyramid no de daa obuo k\\\\u03b5se...\", \"targets\": \"\\\"\\\\u0aae\\\\u0ab9\\\\u0abe\\\\u0aa8 \\\\u0aaa\\\\u0abf\\\\u0ab0\\\\u0abe...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "hi": {"config_name": "hi", "sample_row": "{\"inputs\": \"\\\"Article in Indonesian: Jika Anda tidak ingin perg...\", \"targets\": \"\\\"\\\\u0932\\\\u094b\\\\u0917\\\\u094b\\\\u0902 \\\\u0915\\\\u094b \\\\u091...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "id": {"config_name": "id", "sample_row": "{\"inputs\": \"\\\"Article in Spanish: Podr\\\\u00e1s usar una herramie...\", \"targets\": \"\\\"Buka terminal di dalam komputer. Pasang lshw (jik...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ig": {"config_name": "ig", "sample_row": "{\"inputs\": \"\\\"Text in Tswana: Tshenolo Ntheetsang\\\\nTranslation ...\", \"targets\": \"\\\"Ya mere, na-ege nt\\\\u1ecb ihe ga-ekpughe .\\\"\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ki": {"config_name": "ki", "sample_row": "{\"inputs\": \"\\\"Com isso, os jogadores poder\\\\u00e3o controlar a\\\\u...\", \"targets\": \"\\\"\\\\u0169nd\\\\u0169 \\\\u0169cio n\\\\u0129 \\\\u0169r\\\\u0129hot...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "kn": {"config_name": "kn", "sample_row": "{\"inputs\": \"\\\"A text in Tsonga: Xiphiqo xin\\\\u2019wana xa tilens...\", \"targets\": \"\\\"\\\\u0c9c\\\\u0cc2\\\\u0cae\\\\u0ccd \\\\u0cb2\\\\u0cc6\\\\u0ca8\\\\u0ccd...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "lg": {"config_name": "lg", "sample_row": "{\"inputs\": \"\\\"\\\\u0b89\\\\u0bb2\\\\u0b95 \\\\u0b93\\\\u0b9f\\\\u0bc1\\\\u0bae\\\\u0bcd...\", \"targets\": \"\\\"Abalambuzi babaddusi okwetoolola ensi yonna, aba ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ln": {"config_name": "ln", "sample_row": "{\"inputs\": \"\\\"A text in Telugu: \\\\\\\"\\\\\\\"\\\\\\\"\\\\u0c26\\\\u0c3e\\\\u0c28\\\\u0c3f ...\", \"targets\": \"\\\"Soki tolobeli ndenge eza mpasi na kokoma kuna, ba...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ml": {"config_name": "ml", "sample_row": "{\"inputs\": \"\\\"A text in Twi: Binary akontahy\\\\u025bde betumi afa...\", \"targets\": \"\\\"\\\\u0d2c\\\\u0d48\\\\u0d28\\\\u0d31\\\\u0d3f \\\\u0d28\\\\u0d2e\\\\u0d4d...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "mr": {"config_name": "mr", "sample_row": "{\"inputs\": \"\\\"\\\\u06a9\\\\u06cc\\\\u0644\\\\u0634\\\\u06cc\\\\u0645 \\\\u0627\\\\u0648...\", \"targets\": \"\\\"\\\\u0915\\\\u0945\\\\u0932\\\\u094d\\\\u0936\\\\u093f\\\\u092f\\\\u092e ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ne": {"config_name": "ne", "sample_row": "{\"inputs\": \"\\\"Tabax yu bari da\\\\u00f1oo rafet lool te soo koy xo...\", \"targets\": \"\\\"\\\\u0927\\\\u0947\\\\u0930\\\\u0948 \\\\u092d\\\\u0935\\\\u0928\\\\u0939...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "nso": {"config_name": "nso", "sample_row": "{\"inputs\": \"\\\"A text in Twi: 1920s mu no,na \\\\u0254manfo a w\\\\u02...\", \"targets\": \"\\\"Ka nako ya bo 1920, dikgopolo t\\\\u0161eo di hlolag...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ny": {"config_name": "ny", "sample_row": "{\"inputs\": \"\\\"Text in Xhosa: Ukutshata nako kuyingozi.\\\\nTransla...\", \"targets\": \"\\\"Kukwatira kumakhalanso koopsa.\\\"\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "or": {"config_name": "or", "sample_row": "{\"inputs\": \"\\\"Ini berarti Anda dapat mengunjungi kota bersejara...\", \"targets\": \"\\\"\\\\u0b0f\\\\u0b39\\\\u0b3e\\\\u0b30 \\\\u0b05\\\\u0b30\\\\u0b4d\\\\u0b25...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "pa": {"config_name": "pa", "sample_row": "{\"inputs\": \"\\\"Abaturage bose b\\\\u2019Umujyi wa Vatikani ni Abany...\", \"targets\": \"\\\"\\\\u0a35\\\\u0a48\\\\u0a1f\\\\u0a40\\\\u0a15\\\\u0a28 \\\\u0a38\\\\u0a3f...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "pt": {"config_name": "pt", "sample_row": "{\"inputs\": \"\\\"Tubuh membutuhkan air agar bisa berfungsi. Jika A...\", \"targets\": \"\\\"Beba muita \\\\u00e1gua Pratique mais atividade f\\\\u0...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "rn": {"config_name": "rn", "sample_row": "{\"inputs\": \"\\\"\\\\u09af\\\\u09a6\\\\u09bf \\\\u0986\\\\u09aa\\\\u09c1\\\\u09a8\\\\u09bf...\", \"targets\": \"\\\"Iyo mwiyumva mwobishobora cane, fata ako karyo ku...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "rw": {"config_name": "rw", "sample_row": "{\"inputs\": \"\\\"Text in Yoruba: L\\\\u1ecdgan ti Apple logo farahan,...\", \"targets\": \"\\\"Usubire kuri flash disck yawe urasanga data zagar...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "sn": {"config_name": "sn", "sample_row": "{\"inputs\": \"\\\"Text in Swahili (individual language): Ikiwa mwaj...\", \"targets\": \"\\\"kana waunoshandira ane kodzero pfuma zvedzidzo uk...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "st": {"config_name": "st", "sample_row": "{\"inputs\": \"\\\"\\\\u0c15\\\\u0c4a\\\\u0c30\\\\u0c3f\\\\u0c2f\\\\u0c30\\\\u0c4d \\\\u0c15...\", \"targets\": \"\\\"Dik\\\\u2019hampani tsa ho tsamaisa dintho di lefuwa...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "sw": {"config_name": "sw", "sample_row": "{\"inputs\": \"\\\"I wonder Mama wa rais wa Japani ni nani?\\\\n\\\\nCan y...\", \"targets\": \"\\\"Vita kati ya Japani na Urusi ya 1905 ilipiganiwa ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ta": {"config_name": "ta", "sample_row": "{\"inputs\": \"\\\"ITokyo kuzoba ilona kuphela idolobha lase-Asia el...\", \"targets\": \"\\\"1964\\\\u0bb2\\\\u0bcd \\\\u0bb5\\\\u0bbf\\\\u0bb3\\\\u0bc8\\\\u0baf\\\\u...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "te": {"config_name": "te", "sample_row": "{\"inputs\": \"\\\"Rumor has it that 1828-1835 \\\\u0c2e\\\\u0c27\\\\u0c4d\\\\u0...\", \"targets\": \"\\\"\\\\u0c32\\\\u0c3e\\\\u0c30\\\\u0c4d\\\\u0c21\\\\u0c41 \\\\u0c35\\\\u0c3f...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "tn": {"config_name": "tn", "sample_row": "{\"inputs\": \"\\\"L'any 1995 se'l va votar com a millor jugador de ...\", \"targets\": \"\\\"Ka ngwaga wa 1995 o ne a tlhophiwa jaaka motshame...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ts": {"config_name": "ts", "sample_row": "{\"inputs\": \"\\\"Text in Kinyarwanda: Mbese mu ihunga ryawe uhagaz...\", \"targets\": \"\\\"ri kwihi ribye ra n'wina ra vutumbelo ke?\\\"\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "tum": {"config_name": "tum", "sample_row": "{\"inputs\": \"\\\"W\\\\u011bma s\\\\u025b\\\\u0301d\\\\u00f3 tom\\\\u025b xw\\\\u00e9...\", \"targets\": \"\\\"Makampani gha kuyegha katundu ghakulipirika makol...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "tw": {"config_name": "tw", "sample_row": "{\"inputs\": \"\\\"La compra libre de impuestos aduaneros ofrece una...\", \"targets\": \"\\\"Duty free shopping y\\\\u025b hokwan a wode b\\\\u025bt...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "ur": {"config_name": "ur", "sample_row": "{\"inputs\": \"\\\"Given the below title and summary of an article, ...\", \"targets\": \"\\\"\\\\u0631\\\\u0627\\\\u0648\\\\u0644\\\\u067e\\\\u0646\\\\u0688\\\\u06cc ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "vi": {"config_name": "vi", "sample_row": "{\"inputs\": \"\\\"O vermelho \\\\u00e9 uma cor prim\\\\u00e1ria, e por is...\", \"targets\": \"\\\"Hi\\\\u1ec3u r\\\\u1eb1ng b\\\\u1ea1n kh\\\\u00f4ng th\\\\u1ec3 ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "wo": {"config_name": "wo", "sample_row": "{\"inputs\": \"\\\"Nta gabishwa ry'igihuhusi ca tsunami rirasohorwa,...\", \"targets\": \"\\\"Amul benn \\\\u00e0ddu bu\\\\u00f1u def ci tsunami buy ...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "xh": {"config_name": "xh", "sample_row": "{\"inputs\": \"\\\"Text in Zulu: Umhlanga Guest house opinie, Umhlan...\", \"targets\": \"\\\"Umhlanga Guest house, Umhlanga Rocks\\\"\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "yo": {"config_name": "yo", "sample_row": "{\"inputs\": \"\\\"A text in Malayalam: \\\\u0d35\\\\u0d3f\\\\u0d32\\\\u0d2f\\\\u0d...\", \"targets\": \"\\\"W\\\\u1ecd\\\\u0301n ti d\\\\u00e1 \\\\u1ecd\\\\u0300p\\\\u1ecd\\\\u03...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "zh": {"config_name": "zh", "sample_row": "{\"inputs\": \"\\\"\\\\u7537\\\\uff1a\\\\u73b0\\\\u5728\\\\u4e3a\\\\u5927\\\\u5bb6\\\\u4ecb\\\\...\", \"targets\": \"\\\"\\\\u80fd\\\\u6ee1\\\\u8db3\\\\u4e0d\\\\u540c\\\\u8bfb\\\\u8005\\\\u7684\\\\...\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}, "zu": {"config_name": "zu", "sample_row": "{\"inputs\": \"\\\"Text in Ganda: mulinunulibwa awatali ffeeza.\\\\nTra...\", \"targets\": \"\\\"uyonikeza mahhala.\\\"\"}", "columns": ["inputs", "targets"], "columns_mapping": {"inputs": "inputs", "targets": "targets"}, "dataset_description": "xP3 (Crosslingual Public Pool of Prompts) is a collection of prompts & datasets across 46 of languages & 16 NLP tasks. It is used for the training of BLOOMZ and mT0, multilingual language models capable of following human instructions in dozens of languages zero-shot.\n", "dataset_name": "bigscience/xP3all"}}, "tags": ["task_categories:other", "annotations_creators:expert-generated", "annotations_creators:crowdsourced", "multilinguality:multilingual", "language:ak", "language:ar", "language:as", "language:bm", "language:bn", "language:ca", "language:code", "language:en", "language:es", "language:eu", "language:fon", "language:fr", "language:gu", "language:hi", "language:id", "language:ig", "language:ki", "language:kn", "language:lg", "language:ln", "language:ml", "language:mr", "language:ne", "language:nso", "language:ny", "language:or", "language:pa", "language:pt", "language:rn", "language:rw", "language:sn", "language:st", "language:sw", "language:ta", "language:te", "language:tn", "language:ts", "language:tum", "language:tw", "language:ur", "language:vi", "language:wo", "language:xh", "language:yo", "language:zh", "language:zu"], "is_gated": false}, "allenai/multi_lexsum": {"dataset_name": "allenai/multi_lexsum", "description": "Multi-LexSum is a multi-doc summarization dataset for civil rights litigation lawsuits with summaries of three granularities.", "downloads": 260, "configs": {"v20220616": {"config_name": "v20220616", "sample_row": "{\"id\": \"\\\"EE-AL-0045\\\"\", \"sources\": \"[\\\"Case 1:05-cv-00530-D Document 1-1 Filed 09/19/20...\", \"summary/long\": \"\\\"On September 15, 2005, the Equal Employment Oppor...\", \"summary/short\": \"\\\"Equal Employment Opportunity Commission brought a...\", \"summary/tiny\": \"null\"}", "columns": ["id", "sources", "summary/long", "summary/short", "summary/tiny"], "columns_mapping": {"id": "id", "sources": "sources", "summary/long": "summary/long", "summary/short": "summary/short", "summary/tiny": "summary/tiny"}, "dataset_description": "\nMulti-LexSum is a multi-doc summarization dataset for civil rights litigation lawsuits with summaries of three granularities. \n", "dataset_name": "allenai/multi_lexsum"}, "v20230518": {"config_name": "v20230518", "sample_row": "{\"id\": \"\\\"EE-AL-0045\\\"\", \"sources\": \"[\\\"Case 1:05-cv-00530-D Document 1-1 Filed 09/19/20...\", \"sources_metadata.doc_id\": \"[\\\"EE-AL-0045-0001\\\", \\\"EE-AL-0045-0002\\\", \\\"EE-AL-0045...\", \"sources_metadata.doc_type\": \"[\\\"Complaint\\\", \\\"Complaint\\\", \\\"Settlement Agreement\\\",...\", \"sources_metadata.doc_title\": \"[\\\"Complaint\\\", \\\"Complaint in Intervention\\\", \\\"Consen...\", \"sources_metadata.parser\": \"[\\\"pyxpdf\\\", \\\"pyxpdf\\\", \\\"pyxpdf\\\", \\\"pyxpdf\\\"]\", \"sources_metadata.is_ocr\": \"[true, true, true, false]\", \"sources_metadata.url\": \"[\\\"https://clearinghouse.net/doc/22034\\\", \\\"https://c...\", \"summary/long\": \"\\\"On September 15, 2005, the Equal Employment Oppor...\", \"summary/short\": \"\\\"Equal Employment Opportunity Commission brought a...\", \"summary/tiny\": \"null\", \"case_metadata.case_name\": \"\\\"EEOC v. House of Philadelphia Center, Inc.\\\"\", \"case_metadata.case_type\": \"\\\"Equal Employment\\\"\", \"case_metadata.filing_date\": \"\\\"2005-09-15\\\"\", \"case_metadata.filing_year\": \"\\\"2005\\\"\", \"case_metadata.case_ongoing\": \"\\\"No\\\"\", \"case_metadata.case_ongoing_record_time\": \"\\\"2022-05-19\\\"\", \"case_metadata.closing_year\": \"\\\"2010\\\"\", \"case_metadata.order_start_year\": \"\\\"2007\\\"\", \"case_metadata.order_end_year\": \"\\\"2010\\\"\", \"case_metadata.defendant_payment\": \"\\\"$8,000\\\"\", \"case_metadata.class_action_sought\": \"\\\"No\\\"\", \"case_metadata.class_action_granted\": \"\\\"Not sought\\\"\", \"case_metadata.attorney_orgs\": \"[\\\"EEOC\\\"]\", \"case_metadata.prevailing_party\": \"\\\"Plaintiff\\\"\", \"case_metadata.plaintiff_types\": \"[\\\"Private Plaintiff\\\", \\\"EEOC Plaintiff\\\"]\", \"case_metadata.plaintiff_description\": \"\\\"Equal Employment Opportunity Commission filing on...\", \"case_metadata.constitutional_clauses\": \"[]\", \"case_metadata.causes_of_action\": \"[\\\"Title VII (including PDA), 42 U.S.C. \\\\u00a7 200...\", \"case_metadata.summary_authors\": \"[\\\"22120\\\"]\", \"case_metadata.case_url\": \"\\\"https://clearinghouse.net/case/6817\\\"\"}", "columns": ["id", "sources", "sources_metadata_doc_id", "sources_metadata_doc_type", "sources_metadata_doc_title", "sources_metadata_parser", "sources_metadata_is_ocr", "sources_metadata_url", "summary/long", "summary/short", "summary/tiny", "case_metadata_case_name", "case_metadata_case_type", "case_metadata_filing_date", "case_metadata_filing_year", "case_metadata_case_ongoing", "case_metadata_case_ongoing_record_time", "case_metadata_closing_year", "case_metadata_order_start_year", "case_metadata_order_end_year", "case_metadata_defendant_payment", "case_metadata_class_action_sought", "case_metadata_class_action_granted", "case_metadata_attorney_orgs", "case_metadata_prevailing_party", "case_metadata_plaintiff_types", "case_metadata_plaintiff_description", "case_metadata_constitutional_clauses", "case_metadata_causes_of_action", "case_metadata_summary_authors", "case_metadata_case_url"], "columns_mapping": {"id": "id", "sources": "sources", "sources_metadata.doc_id": "sources_metadata_doc_id", "sources_metadata.doc_type": "sources_metadata_doc_type", "sources_metadata.doc_title": "sources_metadata_doc_title", "sources_metadata.parser": "sources_metadata_parser", "sources_metadata.is_ocr": "sources_metadata_is_ocr", "sources_metadata.url": "sources_metadata_url", "summary/long": "summary/long", "summary/short": "summary/short", "summary/tiny": "summary/tiny", "case_metadata.case_name": "case_metadata_case_name", "case_metadata.case_type": "case_metadata_case_type", "case_metadata.filing_date": "case_metadata_filing_date", "case_metadata.filing_year": "case_metadata_filing_year", "case_metadata.case_ongoing": "case_metadata_case_ongoing", "case_metadata.case_ongoing_record_time": "case_metadata_case_ongoing_record_time", "case_metadata.closing_year": "case_metadata_closing_year", "case_metadata.order_start_year": "case_metadata_order_start_year", "case_metadata.order_end_year": "case_metadata_order_end_year", "case_metadata.defendant_payment": "case_metadata_defendant_payment", "case_metadata.class_action_sought": "case_metadata_class_action_sought", "case_metadata.class_action_granted": "case_metadata_class_action_granted", "case_metadata.attorney_orgs": "case_metadata_attorney_orgs", "case_metadata.prevailing_party": "case_metadata_prevailing_party", "case_metadata.plaintiff_types": "case_metadata_plaintiff_types", "case_metadata.plaintiff_description": "case_metadata_plaintiff_description", "case_metadata.constitutional_clauses": "case_metadata_constitutional_clauses", "case_metadata.causes_of_action": "case_metadata_causes_of_action", "case_metadata.summary_authors": "case_metadata_summary_authors", "case_metadata.case_url": "case_metadata_case_url"}, "dataset_description": "\nMulti-LexSum is a multi-doc summarization dataset for civil rights litigation lawsuits with summaries of three granularities. \n", "dataset_name": "allenai/multi_lexsum"}}, "tags": ["task_categories:summarization", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "tau/sled": {"dataset_name": "tau/sled", "description": "Efficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.", "downloads": 21207, "configs": {"summ_screen_fd": {"config_name": "summ_screen_fd", "sample_row": "{\"id\": \"\\\"fd_Charmed_05x13\\\"\", \"pid\": \"\\\"fd_Charmed_05x13_0\\\"\", \"input\": \"\\\"[Scene: Manor. Paige's room. Paige is there lying...\", \"output\": \"\\\"When residue left from demonic vanquishes builds ...\"}", "columns": ["id", "pid", "input", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\nSummScreenFD (Chen et al., 2021) is a summarization dataset in the domain of TV shows (e.g. Friends, Game of Thrones).\nGiven a transcript of a specific episode, the goal is to produce the episode's recap.\nThe original dataset is divided into two complementary subsets, based on the source of its community contributed transcripts. \nFor SCROLLS, we use the ForeverDreaming (FD) subset, as it incorporates 88 different shows, \nmaking it a more diverse alternative to the TV MegaSite (TMS) subset, which has only 10 shows. \nCommunity-authored recaps for the ForeverDreaming transcripts were collected from English Wikipedia and TVMaze.", "dataset_name": "tau/sled"}, "qasper": {"config_name": "qasper", "sample_row": "{\"id\": \"\\\"753990d0b621d390ed58f20c4d9e4f065f0dc672\\\"\", \"pid\": \"\\\"753990d0b621d390ed58f20c4d9e4f065f0dc672_0\\\"\", \"input\": \"\\\"Introduction\\\\nAffective events BIBREF0 are events...\", \"input_prefix\": \"\\\"What is the seed lexicon?\\\"\", \"output\": \"\\\"a vocabulary of positive and negative predicates ...\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\nQasper (Dasigi et al., 2021) is a question answering dataset over NLP papers filtered from the Semantic Scholar Open Research Corpus (S2ORC).\nQuestions were written by NLP practitioners after reading only the title and abstract of the papers, \nwhile another set of NLP practitioners annotated the answers given the entire document.\nQasper contains abstractive, extractive, and yes/no questions, as well as unanswerable ones.", "dataset_name": "tau/sled"}, "qmsum": {"config_name": "qmsum", "sample_row": "{\"id\": \"\\\"tr-sq-1\\\"\", \"pid\": \"\\\"tr-sq-1_0\\\"\", \"input\": \"\\\"Project Manager: Yep . Soon as I get this . Okay ...\", \"input_prefix\": \"\\\"How Did Project Manager and User Interface introd...\", \"output\": \"\\\"Project Manager introduced that the prototype inc...\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\nQMSum (Zhong et al., 2021) is a query-based summarization dataset, consisting of 232 meetings transcripts from multiple domains. \nThe corpus covers academic group meetings at the International Computer Science Institute and their summaries, industrial product meetings for designing a remote control, \nand committee meetings of the Welsh and Canadian Parliaments, dealing with a variety of public policy issues.\nAnnotators were tasked with writing queries about the broad contents of the meetings, as well as specific questions about certain topics or decisions, \nwhile ensuring that the relevant text for answering each query spans at least 200 words or 10 turns.", "dataset_name": "tau/sled"}, "narrative_qa": {"config_name": "narrative_qa", "sample_row": "{\"id\": \"\\\"39ab35eb8bdecda3cfd79433774fc63c7c699171_0\\\"\", \"pid\": \"\\\"39ab35eb8bdecda3cfd79433774fc63c7c699171_0_0\\\"\", \"input\": \"\\\"Produced by Charles Keller and David Widger\\\\n\\\\n\\\\n...\", \"input_prefix\": \"\\\"What does Sir Nigel come to New York looking for?...\", \"output\": \"\\\"an heiress\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\nNarrativeQA (Ko\u010disk\u00fd et al., 2021) is an established question answering dataset over entire books from Project Gutenberg and movie scripts from different websites.\nAnnotators were given summaries of the books and scripts obtained from Wikipedia, and asked to generate question-answer pairs, \nresulting in about 30 questions and answers for each of the 1,567 books and scripts.\nThey were encouraged to use their own words rather then copying, and avoid asking yes/no questions or ones about the cast.\nEach question was then answered by an additional annotator, providing each question with two reference answers (unless both answers are identical)..", "dataset_name": "tau/sled"}, "gov_report": {"config_name": "gov_report", "sample_row": "{\"id\": \"\\\"crs_RL33819\\\"\", \"pid\": \"\\\"crs_RL33819_0\\\"\", \"input\": \"\\\"\\\\tMajor Developments in 2008\\\\n\\\\nOn December 17, 2...\", \"output\": \"\\\"Since the early 1960s, U.S. policy toward Cuba ha...\"}", "columns": ["id", "pid", "input", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\n@inproceedings{huang-etal-2021-efficient,\n title = \"Efficient Attentions for Long Document Summarization\",\n author = \"Huang, Luyang and\n Cao, Shuyang and\n Parulian, Nikolaus and\n Ji, Heng and\n Wang, Lu\",\n booktitle = \"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies\",\n month = jun,\n year = \"2021\",\n address = \"Online\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://aclanthology.org/2021.naacl-main.112\",\n doi = \"10.18653/v1/2021.naacl-main.112\",\n pages = \"1419--1436\",\n abstract = \"The quadratic computational and memory complexities of large Transformers have limited their scalability for long document summarization. In this paper, we propose Hepos, a novel efficient encoder-decoder attention with head-wise positional strides to effectively pinpoint salient information from the source. We further conduct a systematic study of existing efficient self-attentions. Combined with Hepos, we are able to process ten times more tokens than existing models that use full attentions. For evaluation, we present a new dataset, GovReport, with significantly longer documents and summaries. Results show that our models produce significantly higher ROUGE scores than competitive comparisons, including new state-of-the-art results on PubMed. Human evaluation also shows that our models generate more informative summaries with fewer unfaithful errors.\",\n}", "dataset_name": "tau/sled"}, "contract_nli": {"config_name": "contract_nli", "sample_row": "{\"id\": \"\\\"34_nda-11\\\"\", \"pid\": \"\\\"34_nda-11_0\\\"\", \"input\": \"\\\"NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\\\\nThi...\", \"input_prefix\": \"\\\"Receiving Party shall not reverse engineer any ob...\", \"output\": \"\\\"Not mentioned\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n\nContract NLI (Koreeda and Manning, 2021) is a natural language inference dataset in the legal domain.\nGiven a non-disclosure agreement (the premise), the task is to predict whether a particular legal statement (the hypothesis) is entailed, not entailed (neutral), or cannot be entailed (contradiction) from the contract.\nThe NDAs were manually picked after simple filtering from the Electronic Data Gathering, Analysis, and Retrieval system (EDGAR) and Google.\nThe dataset contains a total of 607 contracts and 17 unique hypotheses, which were combined to produce the dataset's 10,319 examples.", "dataset_name": "tau/sled"}, "quality": {"config_name": "quality", "sample_row": "{\"id\": \"\\\"52995_I3M5VUMM_1\\\"\", \"pid\": \"\\\"52995_I3M5VUMM_1_0\\\"\", \"input\": \"\\\"SPACEMAN ON A SPREE\\\\n\\\\n\\\\n\\\\n\\\\n BY MACK REYNOLDS\\\\...\", \"input_prefix\": \"\\\"Why is Si retirement so significant to the Space ...\", \"output\": \"\\\"Training new spacemen is costly and time consumin...\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\n@article{pang2021quality,\n title={{QuALITY}: Question Answering with Long Input Texts, Yes!},\n author={Pang, Richard Yuanzhe and Parrish, Alicia and Joshi, Nitish and Nangia, Nikita and Phang, Jason and Chen, Angelica and Padmakumar, Vishakh and Ma, Johnny and Thompson, Jana and He, He and Bowman, Samuel R.},\n journal={arXiv preprint arXiv:2112.08608},\n year={2021}\n}\n", "dataset_name": "tau/sled"}, "squad": {"config_name": "squad", "sample_row": "{\"id\": \"\\\"5733be284776f41900661182\\\"\", \"pid\": \"\\\"5733be284776f41900661182_0\\\"\", \"input\": \"\\\"Architecturally, the school has a Catholic charac...\", \"input_prefix\": \"\\\"To whom did the Virgin Mary allegedly appear in 1...\", \"output\": \"\\\"Saint Bernadette Soubirous\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\nStanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n", "dataset_name": "tau/sled"}, "squad_shuffled_distractors": {"config_name": "squad_shuffled_distractors", "sample_row": "{\"id\": \"\\\"5733be284776f41900661182\\\"\", \"pid\": \"\\\"5733be284776f41900661182_0\\\"\", \"input\": \"\\\"John initially adopted a defensive posture simila...\", \"input_prefix\": \"\\\"To whom did the Virgin Mary allegedly appear in 1...\", \"output\": \"\\\"Saint Bernadette Soubirous\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\nStanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n", "dataset_name": "tau/sled"}, "squad_ordered_distractors": {"config_name": "squad_ordered_distractors", "sample_row": "{\"id\": \"\\\"5733be284776f41900661182\\\"\", \"pid\": \"\\\"5733be284776f41900661182_0\\\"\", \"input\": \"\\\"Architecturally, the school has a Catholic charac...\", \"input_prefix\": \"\\\"To whom did the Virgin Mary allegedly appear in 1...\", \"output\": \"\\\"Saint Bernadette Soubirous\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\nStanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n", "dataset_name": "tau/sled"}, "hotpotqa": {"config_name": "hotpotqa", "sample_row": "{\"id\": \"\\\"5a7a06935542990198eaf050\\\"\", \"pid\": \"\\\"5a7a06935542990198eaf050_0\\\"\", \"input\": \"\\\"Arthur's Magazine (1844\\\\u20131846) was an America...\", \"input_prefix\": \"\\\"Which magazine was started first Arthur's Magazin...\", \"output\": \"\\\"Arthur's Magazine\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\nHotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features:\n(1) the questions require finding and reasoning over multiple supporting documents to answer;\n(2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas;\n(3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervisionand explain the predictions;\n(4) we offer a new type of factoid comparison questions to testQA systems\u2019 ability to extract relevant facts and perform necessary comparison.\n", "dataset_name": "tau/sled"}, "hotpotqa_second_only": {"config_name": "hotpotqa_second_only", "sample_row": "{\"id\": \"\\\"5a7a06935542990198eaf050\\\"\", \"pid\": \"\\\"5a7a06935542990198eaf050_0\\\"\", \"input\": \"\\\"First for Women is a woman's magazine published b...\", \"input_prefix\": \"\\\"Which magazine was started first Arthur's Magazin...\", \"output\": \"\\\"Arthur's Magazine\\\"\"}", "columns": ["id", "pid", "input", "input_prefix", "output"], "columns_mapping": {"id": "id", "pid": "pid", "input": "input", "input_prefix": "input_prefix", "output": "output"}, "dataset_description": "\nEfficient Long-Text Understanding with Short-Text Models.\nOur SLiding-Encoder and Decoder uses any pretrained encoder-decoder model, to independtly encode overlapping chunks of \nthe inputs, and perform fusion-in-decoder to achieve linear-memory requirment for long-range natural language understanding.\n\n\nSCROLLS: Standardized CompaRison Over Long Language Sequences.\nA suite of natural language datasets that require reasoning over long texts.\nhttps://scrolls-benchmark.com/\n\nHotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features:\n(1) the questions require finding and reasoning over multiple supporting documents to answer;\n(2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas;\n(3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervisionand explain the predictions;\n(4) we offer a new type of factoid comparison questions to testQA systems\u2019 ability to extract relevant facts and perform necessary comparison.\n", "dataset_name": "tau/sled"}}, "tags": ["task_categories:question-answering", "task_categories:summarization", "task_categories:text-generation", "task_ids:multiple-choice-qa", "task_ids:natural-language-inference", "language:en", "multi-hop-question-answering", "query-based-summarization", "long-texts"], "is_gated": false}, "NbAiLab/norwegian-paws-x": {"dataset_name": "NbAiLab/norwegian-paws-x", "description": "Norwegian PAWS-X, Bokmaal and Nynorsk machine-translated versions of PAWS-X.\n\nPAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.", "downloads": 18, "configs": {"nb": {"config_name": "nb", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"I Paris i oktober 1560 m\\\\u00f8tte han hemmelig de...\", \"sentence2\": \"\\\"I oktober 1560 m\\\\u00f8tte han hemmelig den engels...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "Norwegian PAWS-X, Bokmaal and Nynorsk machine-translated versions of PAWS-X.\n\nPAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "NbAiLab/norwegian-paws-x"}, "nn": {"config_name": "nn", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"I Paris i oktober 1560 m\\\\u00f8tte han i l\\\\u00f8yn...\", \"sentence2\": \"\\\"I oktober 1560 m\\\\u00f8tte han i l\\\\u00f8ynd den en...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "Norwegian PAWS-X, Bokmaal and Nynorsk machine-translated versions of PAWS-X.\n\nPAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "NbAiLab/norwegian-paws-x"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "task_ids:semantic-similarity-scoring", "task_ids:text-scoring", "task_ids:multi-input-text-classification", "annotations_creators:expert-generated", "annotations_creators:machine-generated", "multilinguality:multilingual", "source_datasets:extended|other-paws", "language:nb", "language:nn"], "is_gated": false}, "jakartaresearch/google-play-review": {"dataset_name": "jakartaresearch/google-play-review", "description": "This dataset is built as a playground for beginner to make a use case for creating sentiment analysis model.", "downloads": 142, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\" Halo\\\\n blibli. Sedikit saran untuk gratis ongkir...\", \"label\": \"\\\"pos\\\"\", \"stars\": \"4\"}", "columns": ["text", "label", "stars"], "columns_mapping": {"text": "text", "label": "label", "stars": "stars"}, "dataset_description": "This dataset is built as a playground for beginner to make a use case for creating sentiment analysis model.\n", "dataset_name": "jakartaresearch/google-play-review"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:id", "sentiment", "google-play", "indonesian"], "is_gated": false}, "jakartaresearch/news-title-gen": {"dataset_name": "jakartaresearch/news-title-gen", "description": "This dataset is built for generating text for news title.", "downloads": 23, "configs": {"default": {"config_name": "default", "sample_row": "{\"title\": \"\\\"Muncul Temuan Baru, Virus Corona Berasal dari Lab...\", \"link\": \"\\\"https://www.tribunnews.com/topic/virus-corona\\\"\", \"date\": \"\\\"2020-02-21\\\"\"}", "columns": ["title", "link", "date"], "columns_mapping": {"title": "title", "link": "link", "date": "date"}, "dataset_description": "This dataset is built for generating text for news title.\n", "dataset_name": "jakartaresearch/news-title-gen"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:id", "newspapers", "title", "news"], "is_gated": false}, "m3/multi_domain_document_classification": {"dataset_name": "m3/multi_domain_document_classification", "description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "downloads": 23, "configs": {"chemprot": {"config_name": "chemprot", "sample_row": "{\"text\": \"\\\"<< Epidermal growth factor receptor >> inhibitors...\", \"label\": \"8\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "citation_intent": {"config_name": "citation_intent", "sample_row": "{\"text\": \"\\\"Thus , over the past few years , along with advan...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "hyperpartisan_news": {"config_name": "hyperpartisan_news", "sample_row": "{\"text\": \"\\\"As seen on The Five Police Group Boycotts Ben &am...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "rct_sample": {"config_name": "rct_sample", "sample_row": "{\"text\": \"\\\"Use of the mobile application was greater than in...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "sciie": {"config_name": "sciie", "sample_row": "{\"text\": \"\\\"The agreement in question involves number in [[ n...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "amcd": {"config_name": "amcd", "sample_row": "{\"text\": \"\\\"It has a modern look, and doesn't take up that mu...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "yelp_review": {"config_name": "yelp_review", "sample_row": "{\"text\": \"\\\"Everything was perfect--the service, the timeline...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "tweet_eval_irony": {"config_name": "tweet_eval_irony", "sample_row": "{\"text\": \"\\\"seeing ppl walking w/ crutches makes me really ex...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "tweet_eval_hate": {"config_name": "tweet_eval_hate", "sample_row": "{\"text\": \"\\\"@user nice new signage. Are you not concerned by ...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}, "tweet_eval_emotion": {"config_name": "tweet_eval_emotion", "sample_row": "{\"text\": \"\\\"\\\\u201cWorry is a down payment on a problem you ma...\", \"label\": \"2\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Multi domain document classification dataset used in [https://arxiv.org/pdf/2004.10964.pdf](https://arxiv.org/pdf/2004.10964.pdf)", "dataset_name": "m3/multi_domain_document_classification"}}, "tags": [], "is_gated": false}, "jakartaresearch/semeval-absa": {"dataset_name": "jakartaresearch/semeval-absa", "description": "This dataset is built as a playground for aspect-based sentiment analysis.", "downloads": 90, "configs": {"laptop": {"config_name": "laptop", "sample_row": "{\"id\": \"\\\"2339\\\"\", \"text\": \"\\\"I charge it at night and skip taking the cord wit...\", \"aspects.term\": \"[\\\"cord\\\", \\\"battery life\\\"]\", \"aspects.polarity\": \"[\\\"neutral\\\", \\\"positive\\\"]\", \"aspects.from\": \"[41, 74]\", \"aspects.to\": \"[45, 86]\"}", "columns": ["id", "text", "aspects_term", "aspects_polarity", "aspects_from", "aspects_to"], "columns_mapping": {"id": "id", "text": "text", "aspects.term": "aspects_term", "aspects.polarity": "aspects_polarity", "aspects.from": "aspects_from", "aspects.to": "aspects_to"}, "dataset_description": "This dataset is built as a playground for aspect-based sentiment analysis.\n", "dataset_name": "jakartaresearch/semeval-absa"}, "restaurant": {"config_name": "restaurant", "sample_row": "{\"id\": \"\\\"3121\\\"\", \"text\": \"\\\"But the staff was so horrible to us.\\\"\", \"aspects.term\": \"[\\\"staff\\\"]\", \"aspects.polarity\": \"[\\\"negative\\\"]\", \"aspects.from\": \"[8]\", \"aspects.to\": \"[13]\", \"category.category\": \"[\\\"service\\\"]\", \"category.polarity\": \"[\\\"negative\\\"]\"}", "columns": ["id", "text", "aspects_term", "aspects_polarity", "aspects_from", "aspects_to", "category_category", "category_polarity"], "columns_mapping": {"id": "id", "text": "text", "aspects.term": "aspects_term", "aspects.polarity": "aspects_polarity", "aspects.from": "aspects_from", "aspects.to": "aspects_to", "category.category": "category_category", "category.polarity": "category_polarity"}, "dataset_description": "This dataset is built as a playground for aspect-based sentiment analysis.\n", "dataset_name": "jakartaresearch/semeval-absa"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "aspect-based-sentiment-analysis", "semeval", "semeval2015"], "is_gated": false}, "jonathanli/echr": {"dataset_name": "jonathanli/echr", "description": "The ECHR Cases dataset is designed for experimentation of neural judgment prediction, as in the original 2019 ACL paper \"Neural Legal Judgment Prediction in English\".", "downloads": 23, "configs": {"non-anon": {"config_name": "non-anon", "sample_row": "{\"itemid\": \"\\\"001-60714\\\"\", \"languageisocode\": \"\\\"ENG\\\"\", \"respondent\": \"\\\"FIN\\\"\", \"branch\": \"\\\"CHAMBER\\\"\", \"date\": \"2002\", \"docname\": \"\\\"CASE OF PIETILAINEN v. FINLAND\\\"\", \"importance\": \"4\", \"conclusion\": \"\\\"Violation of Art. 6-1;Non-pecuniary damage - fina...\", \"judges\": \"\\\"Nicolas Bratza\\\"\", \"text\": \"[\\\"The applicant was born in 1943 and lives in Lauk...\", \"violated_articles\": \"[\\\"6\\\"]\", \"violated_paragraphs\": \"[\\\"6-1\\\"]\", \"violated_bulletpoints\": \"[]\", \"non_violated_articles\": \"[]\", \"non_violated_paragraphs\": \"[]\", \"non_violated_bulletpoints\": \"[]\", \"violated\": \"true\"}", "columns": ["itemid", "languageisocode", "respondent", "branch", "date", "docname", "importance", "conclusion", "judges", "text", "violated_articles", "violated_paragraphs", "violated_bulletpoints", "non_violated_articles", "non_violated_paragraphs", "non_violated_bulletpoints", "violated"], "columns_mapping": {"itemid": "itemid", "languageisocode": "languageisocode", "respondent": "respondent", "branch": "branch", "date": "date", "docname": "docname", "importance": "importance", "conclusion": "conclusion", "judges": "judges", "text": "text", "violated_articles": "violated_articles", "violated_paragraphs": "violated_paragraphs", "violated_bulletpoints": "violated_bulletpoints", "non_violated_articles": "non_violated_articles", "non_violated_paragraphs": "non_violated_paragraphs", "non_violated_bulletpoints": "non_violated_bulletpoints", "violated": "violated"}, "dataset_description": "The ECHR Cases dataset is designed for experimentation of neural judgment prediction, as in the original 2019 ACL paper \"Neural Legal Judgment Prediction in English\".\n", "dataset_name": "jonathanli/echr"}, "anon": {"config_name": "anon", "sample_row": "{\"itemid\": \"\\\"001-60714\\\"\", \"languageisocode\": \"\\\"ENG\\\"\", \"respondent\": \"\\\"FIN\\\"\", \"branch\": \"\\\"CHAMBER\\\"\", \"date\": \"2002\", \"docname\": \"\\\"CASE OF PIETILAINEN v. FINLAND\\\"\", \"importance\": \"4\", \"conclusion\": \"\\\"Violation of Art. 6-1;Non-pecuniary damage - fina...\", \"judges\": \"\\\"Nicolas Bratza\\\"\", \"text\": \"[\\\"The applicant was born in DATE and lives in GPE ...\", \"violated_articles\": \"[\\\"6\\\"]\", \"violated_paragraphs\": \"[\\\"6-1\\\"]\", \"violated_bulletpoints\": \"[]\", \"non_violated_articles\": \"[]\", \"non_violated_paragraphs\": \"[]\", \"non_violated_bulletpoints\": \"[]\", \"violated\": \"true\"}", "columns": ["itemid", "languageisocode", "respondent", "branch", "date", "docname", "importance", "conclusion", "judges", "text", "violated_articles", "violated_paragraphs", "violated_bulletpoints", "non_violated_articles", "non_violated_paragraphs", "non_violated_bulletpoints", "violated"], "columns_mapping": {"itemid": "itemid", "languageisocode": "languageisocode", "respondent": "respondent", "branch": "branch", "date": "date", "docname": "docname", "importance": "importance", "conclusion": "conclusion", "judges": "judges", "text": "text", "violated_articles": "violated_articles", "violated_paragraphs": "violated_paragraphs", "violated_bulletpoints": "violated_bulletpoints", "non_violated_articles": "non_violated_articles", "non_violated_paragraphs": "non_violated_paragraphs", "non_violated_bulletpoints": "non_violated_bulletpoints", "violated": "violated"}, "dataset_description": "The ECHR Cases dataset is designed for experimentation of neural judgment prediction, as in the original 2019 ACL paper \"Neural Legal Judgment Prediction in English\".\n", "dataset_name": "jonathanli/echr"}}, "tags": [], "is_gated": false}, "cjvt/sentinews": {"dataset_name": "cjvt/sentinews", "description": "SentiNews is a Slovenian sentiment classification dataset, consisting of news articles manually annotated with their \nsentiment by between 2 and 6 annotators. The news articles contain political, business, economic and financial content \nfrom the Slovenian news portals 24ur, Dnevnik, Finance, Rtvslo, and \u017durnal24. The texts were annotated using the \nfive-level Lickert scale (1 \u2013 very negative, 2 \u2013 negative, 3 \u2013 neutral, 4 \u2013 positive, and 5 \u2013 very positive) on three \nlevels of granularity, i.e. on the document, paragraph, and sentence level. The final sentiment is determined using \nthe following criterion: negative (if average of scores \u2264 2.4); neutral (if average of scores is between 2.4 and 3.6); \npositive (average of annotated scores \u2265 3.6).", "downloads": 47, "configs": {"document_level": {"config_name": "document_level", "sample_row": "{\"nid\": \"1\", \"content\": \"\\\"Evropska komisija mora narediti analizo vzrokov r...\", \"sentiment\": \"\\\"neutral\\\"\"}", "columns": ["nid", "content", "sentiment"], "columns_mapping": {"nid": "nid", "content": "content", "sentiment": "sentiment"}, "dataset_description": "SentiNews is a Slovenian sentiment classification dataset, consisting of news articles manually annotated with their \nsentiment by between 2 and 6 annotators. The news articles contain political, business, economic and financial content \nfrom the Slovenian news portals 24ur, Dnevnik, Finance, Rtvslo, and \u017durnal24. The texts were annotated using the \nfive-level Lickert scale (1 \u2013 very negative, 2 \u2013 negative, 3 \u2013 neutral, 4 \u2013 positive, and 5 \u2013 very positive) on three \nlevels of granularity, i.e. on the document, paragraph, and sentence level. The final sentiment is determined using \nthe following criterion: negative (if average of scores \u2264 2.4); neutral (if average of scores is between 2.4 and 3.6); \npositive (average of annotated scores \u2265 3.6).\n", "dataset_name": "cjvt/sentinews"}, "paragraph_level": {"config_name": "paragraph_level", "sample_row": "{\"nid\": \"1\", \"content\": \"\\\"Evropska komisija mora narediti analizo vzrokov r...\", \"sentiment\": \"\\\"neutral\\\"\", \"pid\": \"1\"}", "columns": ["nid", "content", "sentiment", "pid"], "columns_mapping": {"nid": "nid", "content": "content", "sentiment": "sentiment", "pid": "pid"}, "dataset_description": "SentiNews is a Slovenian sentiment classification dataset, consisting of news articles manually annotated with their \nsentiment by between 2 and 6 annotators. The news articles contain political, business, economic and financial content \nfrom the Slovenian news portals 24ur, Dnevnik, Finance, Rtvslo, and \u017durnal24. The texts were annotated using the \nfive-level Lickert scale (1 \u2013 very negative, 2 \u2013 negative, 3 \u2013 neutral, 4 \u2013 positive, and 5 \u2013 very positive) on three \nlevels of granularity, i.e. on the document, paragraph, and sentence level. The final sentiment is determined using \nthe following criterion: negative (if average of scores \u2264 2.4); neutral (if average of scores is between 2.4 and 3.6); \npositive (average of annotated scores \u2265 3.6).\n", "dataset_name": "cjvt/sentinews"}, "sentence_level": {"config_name": "sentence_level", "sample_row": "{\"nid\": \"1\", \"content\": \"\\\"Evropska komisija mora narediti analizo vzrokov r...\", \"sentiment\": \"\\\"neutral\\\"\", \"pid\": \"1\", \"sid\": \"1\"}", "columns": ["nid", "content", "sentiment", "pid", "sid"], "columns_mapping": {"nid": "nid", "content": "content", "sentiment": "sentiment", "pid": "pid", "sid": "sid"}, "dataset_description": "SentiNews is a Slovenian sentiment classification dataset, consisting of news articles manually annotated with their \nsentiment by between 2 and 6 annotators. The news articles contain political, business, economic and financial content \nfrom the Slovenian news portals 24ur, Dnevnik, Finance, Rtvslo, and \u017durnal24. The texts were annotated using the \nfive-level Lickert scale (1 \u2013 very negative, 2 \u2013 negative, 3 \u2013 neutral, 4 \u2013 positive, and 5 \u2013 very positive) on three \nlevels of granularity, i.e. on the document, paragraph, and sentence level. The final sentiment is determined using \nthe following criterion: negative (if average of scores \u2264 2.4); neutral (if average of scores is between 2.4 and 3.6); \npositive (average of annotated scores \u2265 3.6).\n", "dataset_name": "cjvt/sentinews"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:sl", "slovenian sentiment", "news articles"], "is_gated": false}, "jakartaresearch/indo-movie-subtitle": {"dataset_name": "jakartaresearch/indo-movie-subtitle", "description": "This dataset is built as a playground for analyzing text on movie subtitle", "downloads": 23, "configs": {"default": {"config_name": "default", "sample_row": "{\"movie_title\": \"\\\"Bank.Robbers.The.Last.Great.Heist\\\"\", \"order\": \"\\\"5\\\"\", \"duration\": \"\\\"00:00:42,583 --> 00:00:46,375\\\"\", \"text\": \"\\\"adalah perilaku yang dinilai\\\\noleh hati nuranimu....\"}", "columns": ["movie_title", "order", "duration", "text"], "columns_mapping": {"movie_title": "movie_title", "order": "order", "duration": "duration", "text": "text"}, "dataset_description": "This dataset is built as a playground for analyzing text on movie subtitle\n", "dataset_name": "jakartaresearch/indo-movie-subtitle"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:id", "movie", "subtitle", "indonesian"], "is_gated": false}, "yhavinga/cnn_dailymail_dutch": {"dataset_name": "yhavinga/cnn_dailymail_dutch", "description": "CNN/DailyMail non-anonymized summarization dataset, translated to Dutch with ccmatrix.\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary", "downloads": 31, "configs": {"3.0.0": {"config_name": "3.0.0", "sample_row": "{\"article\": \"\\\"LONDEN, Engeland (Reuters) - Harry Potter-ster Da...\", \"highlights\": \"\\\"Harry Potter-ster Daniel Radcliffe krijgt \\\\u00a3 ...\", \"id\": \"\\\"42c027e4ff9730fbb3de84c1af0d2c506e41c3e4\\\"\"}", "columns": ["article", "highlights", "id"], "columns_mapping": {"article": "article", "highlights": "highlights", "id": "id"}, "dataset_description": "CNN/DailyMail non-anonymized summarization dataset, translated to Dutch with ccmatrix.\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary\n", "dataset_name": "yhavinga/cnn_dailymail_dutch"}}, "tags": ["task_categories:summarization", "task_ids:news-articles-summarization", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:nl"], "is_gated": false}, "SLPL/naab": {"dataset_name": "SLPL/naab", "description": "Huge corpora of textual data are always known to be a crucial need for training deep models such as transformer-based ones. This issue is emerging more in lower resource languages - like Farsi. We propose naab, the biggest cleaned and ready-to-use open-source textual corpus in Farsi. It contains about 130GB of data, 250 million paragraphs, and 15 billion words. The project name is derived from the Farsi word \u0646\u0627\u0628 which means pure and high-grade.", "downloads": 64, "configs": {"all": {"config_name": "all", "sample_row": "{\"text\": \"\\\" \\\\u062a\\\\u0648\\\\u06cc \\\\u0628\\\\u0633\\\\u0627\\\\u0637\\\\u063...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "Huge corpora of textual data are always known to be a crucial need for training deep models such as transformer-based ones. This issue is emerging more in lower resource languages - like Farsi. We propose naab, the biggest cleaned and ready-to-use open-source textual corpus in Farsi. It contains about 130GB of data, 250 million paragraphs, and 15 billion words. The project name is derived from the Farsi word \u0646\u0627\u0628 which means pure and high-grade.\n", "dataset_name": "SLPL/naab"}}, "tags": ["task_categories:fill-mask", "task_categories:text-generation", "task_ids:language-modeling", "task_ids:masked-language-modeling", "multilinguality:monolingual", "language:fa"], "is_gated": false}, "jakartaresearch/inglish": {"dataset_name": "jakartaresearch/inglish", "description": "This dataset is built as a playground for beginner to make a translation model for Indonesian and English.", "downloads": 158, "configs": {"default": {"config_name": "default", "sample_row": "{\"english\": \"\\\"Amrozi accused his brother, whom he called \\\\\\\"the ...\", \"indonesian\": \"\\\"Amrozi menuduh saudaranya, yang dia sebut \\\\\\\"saksi...\"}", "columns": ["english", "indonesian"], "columns_mapping": {"english": "english", "indonesian": "indonesian"}, "dataset_description": "This dataset is built as a playground for beginner to make a translation model for Indonesian and English.\n", "dataset_name": "jakartaresearch/inglish"}}, "tags": ["task_categories:translation", "annotations_creators:machine-generated", "multilinguality:translation", "source_datasets:original", "language:id", "language:en", "indonesian", "english", "translation"], "is_gated": false}, "yhavinga/xsum_dutch": {"dataset_name": "yhavinga/xsum_dutch", "description": "Extreme Summarization (XSum) Dataset.\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.", "downloads": 111, "configs": {"1.0.0": {"config_name": "1.0.0", "sample_row": "{\"document\": \"\\\"De volledige kosten van de schade in Newton Stewa...\", \"summary\": \"\\\"Opruimingsoperaties worden voortgezet in de Schot...\", \"id\": \"\\\"35232142\\\"\"}", "columns": ["document", "summary", "id"], "columns_mapping": {"document": "document", "summary": "summary", "id": "id"}, "dataset_description": "\nExtreme Summarization (XSum) Dataset.\nThere are three features:\n - document: Input news article.\n - summary: One sentence summary of the article.\n - id: BBC ID of the article.\n\n", "dataset_name": "yhavinga/xsum_dutch"}}, "tags": ["task_categories:summarization", "task_ids:news-articles-summarization", "language:nl"], "is_gated": false}, "RCC-MSU/collection3": {"dataset_name": "RCC-MSU/collection3", "description": "Collection3 is a Russian dataset for named entity recognition annotated with LOC (location), PER (person), and ORG (organization) tags.\n\nDataset is based on collection Persons-1000 originally containing 1000 news documents labeled only with names of persons.\nAdditional labels were added by Valerie Mozharova and Natalia Loukachevitch.\nConversion to the IOB2 format and splitting into train, validation and test sets was done by DeepPavlov team.\n\nFor more details see https://ieeexplore.ieee.org/document/7584769 and http://labinform.ru/pub/named_entities/index.htm", "downloads": 92, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u0414\\\\u043e\\\\u043f\\\\u043e\\\\u043b\\\\u043d\\\\u0435\\\\u043d...\", \"ner_tags\": \"[0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "Collection3 is a Russian dataset for named entity recognition annotated with LOC (location), PER (person), and ORG (organization) tags.\n\nDataset is based on collection Persons-1000 originally containing 1000 news documents labeled only with names of persons.\nAdditional labels were added by Valerie Mozharova and Natalia Loukachevitch.\nConversion to the IOB2 format and splitting into train, validation and test sets was done by DeepPavlov team.\n\nFor more details see https://ieeexplore.ieee.org/document/7584769 and http://labinform.ru/pub/named_entities/index.htm\n", "dataset_name": "RCC-MSU/collection3"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:other", "multilinguality:monolingual", "language:ru"], "is_gated": false}, "OxAISH-AL-LLM/wiki_toxic": {"dataset_name": "OxAISH-AL-LLM/wiki_toxic", "description": "Jigsaw Toxic Comment Challenge dataset. This dataset was the basis of a Kaggle competition run by Jigsaw", "downloads": 386, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"794c30aff0931384\\\"\", \"comment_text\": \"\\\"And that's not a personal attack^^ ?\\\"\", \"label\": \"0\"}", "columns": ["id", "comment_text", "label"], "columns_mapping": {"id": "id", "comment_text": "comment_text", "label": "label"}, "dataset_description": "Jigsaw Toxic Comment Challenge dataset. This dataset was the basis of a Kaggle competition run by Jigsaw\n", "dataset_name": "OxAISH-AL-LLM/wiki_toxic"}}, "tags": ["task_categories:text-classification", "task_ids:hate-speech-detection", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|other", "language:en", "wikipedia", "toxicity", "toxic comments"], "is_gated": false}, "BDas/ArabicNLPDataset": {"dataset_name": "BDas/ArabicNLPDataset", "description": "The dataset, prepared in Arabic, includes 10.000 tests, 10.000 validations and 80000 train data.\nThe data is composed of customer comments and created from e-commerce sites.", "downloads": 22, "configs": {"ArabicData": {"config_name": "ArabicData", "sample_row": "{\"text\": \"\\\"\\\\ufeff\\\\u062d\\\\u062f\\\\u064a\\\\u062f \\\\u062e\\\\u0641\\\\u064a...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The dataset, prepared in Arabic, includes 10.000 tests, 10.000 validations and 80000 train data.\nThe data is composed of customer comments and created from e-commerce sites.\n", "dataset_name": "BDas/ArabicNLPDataset"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "BDas/EnglishNLPDataset": {"dataset_name": "BDas/EnglishNLPDataset", "description": "The dataset, prepared in English, includes 10.000 tests, 10.000 validations and 80000 train data.\nThe data is composed of customer comments and created from e-commerce sites.", "downloads": 11, "configs": {"EnglishData": {"config_name": "EnglishData", "sample_row": "{\"text\": \"\\\"my fav\\\"\", \"label\": \"2\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The dataset, prepared in English, includes 10.000 tests, 10.000 validations and 80000 train data.\nThe data is composed of customer comments and created from e-commerce sites.\n", "dataset_name": "BDas/EnglishNLPDataset"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "alexandrainst/scandi-qa": {"dataset_name": "alexandrainst/scandi-qa", "description": "ScandiQA is a dataset of questions and answers in the Danish, Norwegian, and Swedish\nlanguages. All samples come from the Natural Questions (NQ) dataset, which is a large\nquestion answering dataset from Google searches. The Scandinavian questions and answers\ncome from the MKQA dataset, where 10,000 NQ samples were manually translated into,\namong others, Danish, Norwegian, and Swedish. However, this did not include a\ntranslated context, hindering the training of extractive question answering models.\n\nWe merged the NQ dataset with the MKQA dataset, and extracted contexts as either \"long\nanswers\" from the NQ dataset, being the paragraph in which the answer was found, or\notherwise we extract the context by locating the paragraphs which have the largest\ncosine similarity to the question, and which contains the desired answer.\n\nFurther, many answers in the MKQA dataset were \"language normalised\": for instance, all\ndate answers were converted to the format \"YYYY-MM-DD\", meaning that in most cases\nthese answers are not appearing in any paragraphs. We solve this by extending the MKQA\nanswers with plausible \"answer candidates\", being slight perturbations or translations\nof the answer.\n\nWith the contexts extracted, we translated these to Danish, Swedish and Norwegian using\nthe DeepL translation service for Danish and Swedish, and the Google Translation\nservice for Norwegian. After translation we ensured that the Scandinavian answers do\nindeed occur in the translated contexts.\n\nAs we are filtering the MKQA samples at both the \"merging stage\" and the \"translation\nstage\", we are not able to fully convert the 10,000 samples to the Scandinavian\nlanguages, and instead get roughly 8,000 samples per language. These have further been\nsplit into a training, validation and test split, with the former two containing\nroughly 750 samples. The splits have been created in such a way that the proportion of\nsamples without an answer is roughly the same in each split.", "downloads": 13, "configs": {"da": {"config_name": "da", "sample_row": "{\"id\": \"\\\"6277735658261425592\\\"\", \"question\": \"\\\"Hvor stammer udr\\\\u00e5bet great scott fra?\\\"\", \"answers.text\": \"[\\\"\\\"]\", \"answers.answer_start\": \"[-1]\", \"context\": \"\\\"Great Scott! er en indskydelse af overraskelse, f...\", \"answers_en.text\": \"[\\\"\\\"]\", \"answers_en.answer_start\": \"[-1]\", \"context_en\": \"\\\"Great Scott! is an interjection of surprise, amaz...\", \"title_en\": \"\\\"Great Scott\\\"\"}", "columns": ["id", "question", "answers_text", "answers_answer_start", "context", "answers_en_text", "answers_en_answer_start", "context_en", "title_en"], "columns_mapping": {"id": "id", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "context": "context", "answers_en.text": "answers_en_text", "answers_en.answer_start": "answers_en_answer_start", "context_en": "context_en", "title_en": "title_en"}, "dataset_description": "\nScandiQA is a dataset of questions and answers in the Danish, Norwegian, and Swedish\nlanguages. All samples come from the Natural Questions (NQ) dataset, which is a large\nquestion answering dataset from Google searches. The Scandinavian questions and answers\ncome from the MKQA dataset, where 10,000 NQ samples were manually translated into,\namong others, Danish, Norwegian, and Swedish. However, this did not include a\ntranslated context, hindering the training of extractive question answering models.\n\nWe merged the NQ dataset with the MKQA dataset, and extracted contexts as either \"long\nanswers\" from the NQ dataset, being the paragraph in which the answer was found, or\notherwise we extract the context by locating the paragraphs which have the largest\ncosine similarity to the question, and which contains the desired answer.\n\nFurther, many answers in the MKQA dataset were \"language normalised\": for instance, all\ndate answers were converted to the format \"YYYY-MM-DD\", meaning that in most cases\nthese answers are not appearing in any paragraphs. We solve this by extending the MKQA\nanswers with plausible \"answer candidates\", being slight perturbations or translations\nof the answer.\n\nWith the contexts extracted, we translated these to Danish, Swedish and Norwegian using\nthe DeepL translation service for Danish and Swedish, and the Google Translation\nservice for Norwegian. After translation we ensured that the Scandinavian answers do\nindeed occur in the translated contexts.\n\nAs we are filtering the MKQA samples at both the \"merging stage\" and the \"translation\nstage\", we are not able to fully convert the 10,000 samples to the Scandinavian\nlanguages, and instead get roughly 8,000 samples per language. These have further been\nsplit into a training, validation and test split, with the former two containing\nroughly 750 samples. The splits have been created in such a way that the proportion of\nsamples without an answer is roughly the same in each split.\n", "dataset_name": "alexandrainst/scandi-qa"}, "sv": {"config_name": "sv", "sample_row": "{\"id\": \"\\\"6277735658261425592\\\"\", \"question\": \"\\\"Var kommer frasen great scott fr\\\\u00e5n?\\\"\", \"answers.text\": \"[\\\"\\\"]\", \"answers.answer_start\": \"[-1]\", \"context\": \"\\\"Great Scott! \\\\u00e4r en interjektion av \\\\u00f6ver...\", \"answers_en.text\": \"[\\\"\\\"]\", \"answers_en.answer_start\": \"[-1]\", \"context_en\": \"\\\"Great Scott! is an interjection of surprise, amaz...\", \"title_en\": \"\\\"Great Scott\\\"\"}", "columns": ["id", "question", "answers_text", "answers_answer_start", "context", "answers_en_text", "answers_en_answer_start", "context_en", "title_en"], "columns_mapping": {"id": "id", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "context": "context", "answers_en.text": "answers_en_text", "answers_en.answer_start": "answers_en_answer_start", "context_en": "context_en", "title_en": "title_en"}, "dataset_description": "\nScandiQA is a dataset of questions and answers in the Danish, Norwegian, and Swedish\nlanguages. All samples come from the Natural Questions (NQ) dataset, which is a large\nquestion answering dataset from Google searches. The Scandinavian questions and answers\ncome from the MKQA dataset, where 10,000 NQ samples were manually translated into,\namong others, Danish, Norwegian, and Swedish. However, this did not include a\ntranslated context, hindering the training of extractive question answering models.\n\nWe merged the NQ dataset with the MKQA dataset, and extracted contexts as either \"long\nanswers\" from the NQ dataset, being the paragraph in which the answer was found, or\notherwise we extract the context by locating the paragraphs which have the largest\ncosine similarity to the question, and which contains the desired answer.\n\nFurther, many answers in the MKQA dataset were \"language normalised\": for instance, all\ndate answers were converted to the format \"YYYY-MM-DD\", meaning that in most cases\nthese answers are not appearing in any paragraphs. We solve this by extending the MKQA\nanswers with plausible \"answer candidates\", being slight perturbations or translations\nof the answer.\n\nWith the contexts extracted, we translated these to Danish, Swedish and Norwegian using\nthe DeepL translation service for Danish and Swedish, and the Google Translation\nservice for Norwegian. After translation we ensured that the Scandinavian answers do\nindeed occur in the translated contexts.\n\nAs we are filtering the MKQA samples at both the \"merging stage\" and the \"translation\nstage\", we are not able to fully convert the 10,000 samples to the Scandinavian\nlanguages, and instead get roughly 8,000 samples per language. These have further been\nsplit into a training, validation and test split, with the former two containing\nroughly 750 samples. The splits have been created in such a way that the proportion of\nsamples without an answer is roughly the same in each split.\n", "dataset_name": "alexandrainst/scandi-qa"}, "no": {"config_name": "no", "sample_row": "{\"id\": \"\\\"6277735658261425592\\\"\", \"question\": \"\\\"Hvor kommer uttrykket great scott fra?\\\"\", \"answers.text\": \"[\\\"\\\"]\", \"answers.answer_start\": \"[-1]\", \"context\": \"\\\"Great Scott\\\\nFlott Scott! er et innslag av overra...\", \"answers_en.text\": \"[\\\"\\\"]\", \"answers_en.answer_start\": \"[-1]\", \"context_en\": \"\\\"Great Scott! is an interjection of surprise, amaz...\", \"title_en\": \"\\\"Great Scott\\\"\"}", "columns": ["id", "question", "answers_text", "answers_answer_start", "context", "answers_en_text", "answers_en_answer_start", "context_en", "title_en"], "columns_mapping": {"id": "id", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "context": "context", "answers_en.text": "answers_en_text", "answers_en.answer_start": "answers_en_answer_start", "context_en": "context_en", "title_en": "title_en"}, "dataset_description": "\nScandiQA is a dataset of questions and answers in the Danish, Norwegian, and Swedish\nlanguages. All samples come from the Natural Questions (NQ) dataset, which is a large\nquestion answering dataset from Google searches. The Scandinavian questions and answers\ncome from the MKQA dataset, where 10,000 NQ samples were manually translated into,\namong others, Danish, Norwegian, and Swedish. However, this did not include a\ntranslated context, hindering the training of extractive question answering models.\n\nWe merged the NQ dataset with the MKQA dataset, and extracted contexts as either \"long\nanswers\" from the NQ dataset, being the paragraph in which the answer was found, or\notherwise we extract the context by locating the paragraphs which have the largest\ncosine similarity to the question, and which contains the desired answer.\n\nFurther, many answers in the MKQA dataset were \"language normalised\": for instance, all\ndate answers were converted to the format \"YYYY-MM-DD\", meaning that in most cases\nthese answers are not appearing in any paragraphs. We solve this by extending the MKQA\nanswers with plausible \"answer candidates\", being slight perturbations or translations\nof the answer.\n\nWith the contexts extracted, we translated these to Danish, Swedish and Norwegian using\nthe DeepL translation service for Danish and Swedish, and the Google Translation\nservice for Norwegian. After translation we ensured that the Scandinavian answers do\nindeed occur in the translated contexts.\n\nAs we are filtering the MKQA samples at both the \"merging stage\" and the \"translation\nstage\", we are not able to fully convert the 10,000 samples to the Scandinavian\nlanguages, and instead get roughly 8,000 samples per language. These have further been\nsplit into a training, validation and test split, with the former two containing\nroughly 750 samples. The splits have been created in such a way that the proportion of\nsamples without an answer is roughly the same in each split.\n", "dataset_name": "alexandrainst/scandi-qa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "multilinguality:multilingual", "source_datasets:mkqa", "source_datasets:natural_questions", "language:da", "language:sv", "language:no"], "is_gated": false}, "opus/liv4ever": {"dataset_name": "opus/liv4ever", "description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.", "downloads": 152, "configs": {"en-liv": {"config_name": "en-liv", "sample_row": "{\"translation.en\": \"\\\"Best wishes to our dear colleague, researcher of ...\", \"translation.liv\": \"\\\"P\\\\u01dfgi\\\\u0146 v\\\\u022fnn\\\\u00f5 m\\\\u00e4d kol\\\\u011...\"}", "columns": ["translation_en", "translation_liv"], "columns_mapping": {"translation.en": "translation_en", "translation.liv": "translation_liv"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "et-liv": {"config_name": "et-liv", "sample_row": "{\"translation.et\": \"\\\"K\\\\u00e4ega lehvitab talle nagu n\\\\u00e4gemiseni.\\\"...\", \"translation.liv\": \"\\\"K\\\\u00e4dk\\\\u00f5ks v\\\\u0113tsi\\\\u0146\\\\u021b\\\\u00f5b t...\"}", "columns": ["translation_et", "translation_liv"], "columns_mapping": {"translation.et": "translation_et", "translation.liv": "translation_liv"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "fr-liv": {"config_name": "fr-liv", "sample_row": "{\"translation.fr\": \"\\\"\\\\u00c9cartez-vous, s\\\\u00e9parez-vous\\\"\", \"translation.liv\": \"\\\"Lagg\\\\u00f5g\\\\u00f5d r\\\\u016bimig\\\\u00f5d\\\"\"}", "columns": ["translation_fr", "translation_liv"], "columns_mapping": {"translation.fr": "translation_fr", "translation.liv": "translation_liv"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "liv-lv": {"config_name": "liv-lv", "sample_row": "{\"translation.liv\": \"\\\"K\\\\u00e4dk\\\\u00f5ks v\\\\u0113tsi\\\\u0146\\\\u021b\\\\u00f5b t...\", \"translation.lv\": \"\\\"Ar roku m\\\\u0101j vi\\\\u0146am it k\\\\u0101 uz redz\\\\u0...\"}", "columns": ["translation_liv", "translation_lv"], "columns_mapping": {"translation.liv": "translation_liv", "translation.lv": "translation_lv"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "en": {"config_name": "en", "sample_row": "{\"text\": \"\\\"As hydronyms are generally ancient, the names of ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "et": {"config_name": "et", "sample_row": "{\"text\": \"\\\"Kus sa l\\\\u00e4hed, marjaneitsi\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "fr": {"config_name": "fr", "sample_row": "{\"text\": \"\\\"\\\\u00c9cartez-vous, s\\\\u00e9parez-vous\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "liv": {"config_name": "liv", "sample_row": "{\"text\": \"\\\"Kus sa l\\\\u01dfd, M\\\\u014d\\\\u0157\\\\u00f5neitst\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}, "lv": {"config_name": "lv", "sample_row": "{\"text\": \"\\\"Valsts Prezidenta prieks\\\\u030cva\\\\u0304rds\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is the Livonian 4-lingual parallel corpus. Livonian is a Uralic / Finnic language with just about 20 fluent\nspeakers and no native speakers (as of 2021). The texts and translations in this corpus were collected from all the\ndigital text resources that could be found by the authors; scanned and printed materials are left for future work.\n", "dataset_name": "opus/liv4ever"}}, "tags": [], "is_gated": false}, "bigbio/biosses": {"dataset_name": "bigbio/biosses", "description": "BIOSSES computes similarity of biomedical sentences by utilizing WordNet as the\ngeneral domain ontology and UMLS as the biomedical domain specific ontology.\nThe original paper outlines the approaches with respect to using annotator\nscore as golden standard. Source view will return all annotator score\nindividually whereas the Bigbio view will return the mean of the annotator\nscore.", "downloads": 51, "configs": {"biosses_source": {"config_name": "biosses_source", "sample_row": "{\"id\": \"0\", \"document_id\": \"1\", \"text_1\": \"\\\"It has recently been shown that Craf is essential...\", \"text_2\": \"\\\"It has recently become evident that Craf is essen...\", \"annotator_a\": \"4\", \"annotator_b\": \"4\", \"annotator_c\": \"4\", \"annotator_d\": \"4\", \"annotator_e\": \"4\"}", "columns": ["id", "document_id", "text_1", "text_2", "annotator_a", "annotator_b", "annotator_c", "annotator_d", "annotator_e"], "columns_mapping": {"id": "id", "document_id": "document_id", "text_1": "text_1", "text_2": "text_2", "annotator_a": "annotator_a", "annotator_b": "annotator_b", "annotator_c": "annotator_c", "annotator_d": "annotator_d", "annotator_e": "annotator_e"}, "dataset_description": "\nBIOSSES computes similarity of biomedical sentences by utilizing WordNet as the\ngeneral domain ontology and UMLS as the biomedical domain specific ontology.\nThe original paper outlines the approaches with respect to using annotator\nscore as golden standard. Source view will return all annotator score\nindividually whereas the Bigbio view will return the mean of the annotator\nscore.\n", "dataset_name": "bigbio/biosses"}, "biosses_bigbio_pairs": {"config_name": "biosses_bigbio_pairs", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"1\\\"\", \"text_1\": \"\\\"It has recently been shown that Craf is essential...\", \"text_2\": \"\\\"It has recently become evident that Craf is essen...\", \"label\": \"\\\"4.0\\\"\"}", "columns": ["id", "document_id", "text_1", "text_2", "label"], "columns_mapping": {"id": "id", "document_id": "document_id", "text_1": "text_1", "text_2": "text_2", "label": "label"}, "dataset_description": "\nBIOSSES computes similarity of biomedical sentences by utilizing WordNet as the\ngeneral domain ontology and UMLS as the biomedical domain specific ontology.\nThe original paper outlines the approaches with respect to using annotator\nscore as golden standard. Source view will return all annotator score\nindividually whereas the Bigbio view will return the mean of the annotator\nscore.\n", "dataset_name": "bigbio/biosses"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "eraldoluis/faquad": {"dataset_name": "eraldoluis/faquad", "description": "Academic secretaries and faculty members of higher education institutions face a common problem: \n the abundance of questions sent by academics \n whose answers are found in available institutional documents. \nThe official documents produced by Brazilian public universities are vast and disperse, \n which discourage students to further search for answers in such sources.\nIn order to lessen this problem, we present FaQuAD: \n a novel machine reading comprehension dataset \n in the domain of Brazilian higher education institutions. \nFaQuAD follows the format of SQuAD (Stanford Question Answering Dataset) [Rajpurkar et al. 2016]. \nIt comprises 900 questions about 249 reading passages (paragraphs), \n which were taken from 18 official documents of a computer science college \n from a Brazilian federal university \n and 21 Wikipedia articles related to Brazilian higher education system. \nAs far as we know, this is the first Portuguese reading comprehension dataset in this format.", "downloads": 186, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"26f2ae969a804ba392e5bd0c62d58896\\\"\", \"title\": \"\\\"UFMS\\\"\", \"context\": \"\\\"Universidade Federal de Mato Grosso do Sul (UFMS)...\", \"question\": \"\\\"O que \\\\u00e9 a UFMS?\\\"\", \"answers.text\": \"[\\\"uma institui\\\\u00e7\\\\u00e3o de ensino superior p\\\\u...\", \"answers.answer_start\": \"[52, 52, 52]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "Academic secretaries and faculty members of higher education institutions face a common problem: \n the abundance of questions sent by academics \n whose answers are found in available institutional documents. \nThe official documents produced by Brazilian public universities are vast and disperse, \n which discourage students to further search for answers in such sources.\nIn order to lessen this problem, we present FaQuAD: \n a novel machine reading comprehension dataset \n in the domain of Brazilian higher education institutions. \nFaQuAD follows the format of SQuAD (Stanford Question Answering Dataset) [Rajpurkar et al. 2016]. \nIt comprises 900 questions about 249 reading passages (paragraphs), \n which were taken from 18 official documents of a computer science college \n from a Brazilian federal university \n and 21 Wikipedia articles related to Brazilian higher education system. \nAs far as we know, this is the first Portuguese reading comprehension dataset in this format.\n", "dataset_name": "eraldoluis/faquad"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|wikipedia", "language:pt"], "is_gated": false}, "neulab/conala": {"dataset_name": "neulab/conala", "description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.", "downloads": 504, "configs": {"curated": {"config_name": "curated", "sample_row": "{\"question_id\": \"41067960\", \"intent\": \"\\\"How to convert a list of multiple integers into a...\", \"rewritten_intent\": \"\\\"Concatenate elements of a list 'x' of multiple in...\", \"snippet\": \"\\\"sum(d * 10 ** i for i, d in enumerate(x[::-1]))\\\"...\"}", "columns": ["question_id", "intent", "rewritten_intent", "snippet"], "columns_mapping": {"question_id": "question_id", "intent": "intent", "rewritten_intent": "rewritten_intent", "snippet": "snippet"}, "dataset_description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.\n", "dataset_name": "neulab/conala"}, "mined": {"config_name": "mined", "sample_row": "{\"question_id\": \"34705205\", \"parent_answer_post_id\": \"34705233\", \"prob\": \"0.8690001442846342\", \"snippet\": \"\\\"sorted(l, key=lambda x: (-int(x[1]), x[0]))\\\"\", \"intent\": \"\\\"Sort a nested list by two elements\\\"\", \"id\": \"\\\"34705205_34705233_0\\\"\"}", "columns": ["question_id", "parent_answer_post_id", "prob", "snippet", "intent", "id"], "columns_mapping": {"question_id": "question_id", "parent_answer_post_id": "parent_answer_post_id", "prob": "prob", "snippet": "snippet", "intent": "intent", "id": "id"}, "dataset_description": "CoNaLa is a dataset of code and natural language pairs crawled from Stack Overflow, for more details please refer to this paper: https://arxiv.org/pdf/1805.08949.pdf or the dataset page https://conala-corpus.github.io/.\n", "dataset_name": "neulab/conala"}}, "tags": ["task_categories:text2text-generation", "multilinguality:monolingual", "source_datasets:original", "language:code", "code-generation"], "is_gated": false}, "codesue/kelly": {"dataset_name": "codesue/kelly", "description": "The Swedish Kelly list is a freely available frequency-based vocabulary list that comprises general-purpose language of modern Swedish. The list was generated from a large web-acquired corpus (SweWaC) of 114 million words dating from the 2010s. It is adapted to the needs of language learners and contains 8,425 most frequent lemmas that cover 80% of SweWaC.\\", "downloads": 12, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"1\", \"raw_frequency\": \"NaN\", \"relative_frequency\": \"NaN\", \"cefr_level\": \"\\\"A1\\\"\", \"source\": \"\\\"manual\\\"\", \"marker\": \"\\\"\\\"\", \"lemma\": \"\\\"andra\\\"\", \"pos\": \"\\\"numeral\\\"\", \"examples\": \"\\\"\\\"\"}", "columns": ["id", "raw_frequency", "relative_frequency", "cefr_level", "source", "marker", "lemma", "pos", "examples"], "columns_mapping": {"id": "id", "raw_frequency": "raw_frequency", "relative_frequency": "relative_frequency", "cefr_level": "cefr_level", "source": "source", "marker": "marker", "lemma": "lemma", "pos": "pos", "examples": "examples"}, "dataset_description": "The Swedish Kelly list is a freely available frequency-based vocabulary list that comprises general-purpose language of modern Swedish. The list was generated from a large web-acquired corpus (SweWaC) of 114 million words dating from the 2010s. It is adapted to the needs of language learners and contains 8,425 most frequent lemmas that cover 80% of SweWaC.", "dataset_name": "codesue/kelly"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "language:sv", "lexicon", "swedish", "CEFR"], "is_gated": false}, "PlanTL-GOB-ES/wnli-es": {"dataset_name": "PlanTL-GOB-ES/wnli-es", "description": "professional translation into Spanish of Winograd NLI dataset as published in GLUE Benchmark.\n The Winograd NLI dataset presents 855 sentence pairs, \n in which the first sentence contains an ambiguity and the second one a possible interpretation of it. \n The label indicates if the interpretation is correct (1) or not (0).", "downloads": 52, "configs": {"winograd": {"config_name": "winograd", "sample_row": "{\"sentence1\": \"\\\"Clav\\\\u00e9 una aguja en una zanahoria. Cuando saq...\", \"sentence2\": \"\\\"La zanahoria ten\\\\u00eda un agujero.\\\"\", \"label\": \"1\"}", "columns": ["sentence1", "sentence2", "label"], "columns_mapping": {"sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "\n professional translation into Spanish of Winograd NLI dataset as published in GLUE Benchmark.\n The Winograd NLI dataset presents 855 sentence pairs, \n in which the first sentence contains an ambiguity and the second one a possible interpretation of it. \n The label indicates if the interpretation is correct (1) or not (0).\n ", "dataset_name": "PlanTL-GOB-ES/wnli-es"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|glue", "language:es"], "is_gated": false}, "bigbio/gad": {"dataset_name": "bigbio/gad", "description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database", "downloads": 154, "configs": {"gad_fold0_source": {"config_name": "gad_fold0_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"this study proposes that A/A genotype at position...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold1_source": {"config_name": "gad_fold1_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"The @GENE$ Asp allele may be a genetic risk facto...\", \"label\": \"0\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold2_source": {"config_name": "gad_fold2_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"The @GENE$ gene is likely to be involved in the g...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold3_source": {"config_name": "gad_fold3_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"In conclusion, a significant association between ...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold4_source": {"config_name": "gad_fold4_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"An interaction with hypertension in the associati...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold5_source": {"config_name": "gad_fold5_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"The polymorphism of @GENE$ promoter -969(G>C) is ...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold6_source": {"config_name": "gad_fold6_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"These results indicate that mutations in NLGN3 an...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold7_source": {"config_name": "gad_fold7_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"This study shows that the @GENE$ gene promoter po...\", \"label\": \"1\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold8_source": {"config_name": "gad_fold8_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"Our findings suggest that the increased productio...\", \"label\": \"0\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold9_source": {"config_name": "gad_fold9_source", "sample_row": "{\"index\": \"\\\"0\\\"\", \"sentence\": \"\\\"Our results support that @GENE$ and CD-105 are cl...\", \"label\": \"0\"}", "columns": ["index", "sentence", "label"], "columns_mapping": {"index": "index", "sentence": "sentence", "label": "label"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold0_bigbio_text": {"config_name": "gad_fold0_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"this study proposes that A/A genotype at position...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold1_bigbio_text": {"config_name": "gad_fold1_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"The @GENE$ Asp allele may be a genetic risk facto...\", \"labels\": \"[\\\"0\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold2_bigbio_text": {"config_name": "gad_fold2_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"The @GENE$ gene is likely to be involved in the g...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold3_bigbio_text": {"config_name": "gad_fold3_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"In conclusion, a significant association between ...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold4_bigbio_text": {"config_name": "gad_fold4_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"An interaction with hypertension in the associati...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold5_bigbio_text": {"config_name": "gad_fold5_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"The polymorphism of @GENE$ promoter -969(G>C) is ...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold6_bigbio_text": {"config_name": "gad_fold6_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"These results indicate that mutations in NLGN3 an...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold7_bigbio_text": {"config_name": "gad_fold7_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"This study shows that the @GENE$ gene promoter po...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold8_bigbio_text": {"config_name": "gad_fold8_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"Our findings suggest that the increased productio...\", \"labels\": \"[\\\"0\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_fold9_bigbio_text": {"config_name": "gad_fold9_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"Our results support that @GENE$ and CD-105 are cl...\", \"labels\": \"[\\\"0\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}, "gad_blurb_bigbio_text": {"config_name": "gad_blurb_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"text\": \"\\\"this study proposes that A/A genotype at position...\", \"labels\": \"[\\\"1\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "A corpus identifying associations between genes and diseases by a semi-automatic\nannotation procedure based on the Genetic Association Database\n", "dataset_name": "bigbio/gad"}}, "tags": ["multilinguality:momolingual", "language:en"], "is_gated": false}, "bigbio/blurb": {"dataset_name": "bigbio/blurb", "description": "The BioCreative II Gene Mention task. The training corpus for the current task consists mainly of the training and testing corpora (text collections) from the BCI task, and the testing corpus for the current task consists of an additional 5,000 sentences that were held 'in reserve' from the previous task. In the current corpus, tokenization is not provided; instead participants are asked to identify a gene mention in a sentence by giving its start and end characters. As before, the training set consists of a set of sentences, and for each sentence a set of gene mentions (GENE annotations).\n\n- Homepage: https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/task-1a-gene-mention-tagging/\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: Overview of BioCreative II gene mention recognition\n https://link.springer.com/article/10.1186/gb-2008-9-s2-s2", "downloads": 184, "configs": {"bc5chem": {"config_name": "bc5chem", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Selegiline\\\", \\\"-\\\", \\\"induced\\\", \\\"postural\\\", \\\"hypote...\", \"type\": \"\\\"chemical\\\"\", \"ner_tags\": \"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "type", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "type": "type", "ner_tags": "ner_tags"}, "dataset_description": "The corpus consists of three separate sets of articles with diseases, chemicals and their relations annotated. The training (500 articles) and development (500 articles) sets were released to task participants in advance to support text-mining method development. The test set (500 articles) was used for final system performance evaluation.\n\n- Homepage: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-v-cdr-corpus\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: BioCreative V CDR task corpus: a resource for chemical disease relation extraction\n https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4860626/\n", "dataset_name": "bigbio/blurb"}, "bc5disease": {"config_name": "bc5disease", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Selegiline\\\", \\\"-\\\", \\\"induced\\\", \\\"postural\\\", \\\"hypote...\", \"type\": \"\\\"disease\\\"\", \"ner_tags\": \"[0, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "type", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "type": "type", "ner_tags": "ner_tags"}, "dataset_description": "The corpus consists of three separate sets of articles with diseases, chemicals and their relations annotated. The training (500 articles) and development (500 articles) sets were released to task participants in advance to support text-mining method development. The test set (500 articles) was used for final system performance evaluation.\n\n- Homepage: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-v-cdr-corpus\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: BioCreative V CDR task corpus: a resource for chemical disease relation extraction\n https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4860626/\n", "dataset_name": "bigbio/blurb"}, "bc2gm": {"config_name": "bc2gm", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Immunohistochemical\\\", \\\"staining\\\", \\\"was\\\", \\\"positi...\", \"type\": \"\\\"gene\\\"\", \"ner_tags\": \"[0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1...\"}", "columns": ["id", "tokens", "type", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "type": "type", "ner_tags": "ner_tags"}, "dataset_description": "The BioCreative II Gene Mention task. The training corpus for the current task consists mainly of the training and testing corpora (text collections) from the BCI task, and the testing corpus for the current task consists of an additional 5,000 sentences that were held 'in reserve' from the previous task. In the current corpus, tokenization is not provided; instead participants are asked to identify a gene mention in a sentence by giving its start and end characters. As before, the training set consists of a set of sentences, and for each sentence a set of gene mentions (GENE annotations).\n\n- Homepage: https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/task-1a-gene-mention-tagging/\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: Overview of BioCreative II gene mention recognition\n https://link.springer.com/article/10.1186/gb-2008-9-s2-s2\n", "dataset_name": "bigbio/blurb"}, "jnlpba": {"config_name": "jnlpba", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"-DOCSTART-\\\"]\", \"type\": \"\\\"protein, DNA, RNA, cell line, or cell type\\\"\", \"ner_tags\": \"[0]\"}", "columns": ["id", "tokens", "type", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "type": "type", "ner_tags": "ner_tags"}, "dataset_description": "The BioNLP / JNLPBA Shared Task 2004 involves the identification and classification of technical terms referring to concepts of interest to biologists in the domain of molecular biology. The task was organized by GENIA Project based on the annotations of the GENIA Term corpus (version 3.02).\n\n- Homepage: http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: Introduction to the Bio-entity Recognition Task at JNLPBA\n https://aclanthology.org/W04-1213\n", "dataset_name": "bigbio/blurb"}, "ncbi_disease": {"config_name": "ncbi_disease", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Identification\\\", \\\"of\\\", \\\"APC2\\\", \\\",\\\", \\\"a\\\", \\\"homolo...\", \"type\": \"\\\"disease\\\"\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]\"}", "columns": ["id", "tokens", "type", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "type": "type", "ner_tags": "ner_tags"}, "dataset_description": "[T]he NCBI disease corpus contains 6,892 disease mentions, which are mapped to 790 unique disease concepts. Of these, 88% link to a MeSH identifier, while the rest contain an OMIM identifier. We were able to link 91% of the mentions to a single disease concept, while the rest are described as a combination of concepts.\n\n- Homepage: https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/\n- Repository: https://github.com/cambridgeltl/MTL-Bioinformatics-2016/raw/master/data/\n- Paper: NCBI disease corpus: a resource for disease name recognition and concept normalization\n https://pubmed.ncbi.nlm.nih.gov/24393765/\n", "dataset_name": "bigbio/blurb"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "RussianNLP/tape": {"dataset_name": "RussianNLP/tape", "description": "The Winograd schema challenge composes tasks with syntactic ambiguity,\nwhich can be resolved with logic and reasoning (Levesque et al., 2012).\n\nThe texts for the Winograd schema problem are obtained using a semi-automatic \npipeline. First, lists of 11 typical grammatical structures with syntactic \nhomonymy (mainly case) are compiled. For example, two noun phrases with a \ncomplex subordinate: 'A trinket from Pompeii that has survived the centuries'.\nRequests corresponding to these constructions are submitted in search of the \nRussian National Corpus, or rather its sub-corpus with removed homonymy. In the \nresulting 2+k examples, homonymy is removed automatically with manual validation\nafterward. Each original sentence is split into multiple examples in the binary \nclassification format, indicating whether the homonymy is resolved correctly or\nnot.", "downloads": 149, "configs": {"winograd.raw": {"config_name": "winograd.raw", "sample_row": "{\"text\": \"\\\"\\\\u041d\\\\u043e \\\\u043f\\\\u043e\\\\u0442\\\\u043e\\\\u043c \\\\u044...\", \"label\": \"0\", \"options\": \"[\\\"\\\\u043f\\\\u0435\\\\u0432\\\\u0438\\\\u0446\\\\u0430\\\", \\\"\\\\u0442\\\\u...\", \"reference\": \"\\\"\\\\u043a\\\\u043e\\\\u0442\\\\u043e\\\\u0440\\\\u0430\\\\u044f\\\"\", \"homonymia_type\": \"1.1\", \"answer\": \"\\\"\\\\u0442\\\\u0443\\\\u0440\\\\u0446\\\\u0438\\\\u0438\\\"\"}", "columns": ["text", "label", "options", "reference", "homonymia_type", "answer"], "columns_mapping": {"text": "text", "label": "label", "options": "options", "reference": "reference", "homonymia_type": "homonymia_type", "answer": "answer"}, "dataset_description": "The Winograd schema challenge composes tasks with syntactic ambiguity,\nwhich can be resolved with logic and reasoning (Levesque et al., 2012).\n\nThe texts for the Winograd schema problem are obtained using a semi-automatic \npipeline. First, lists of 11 typical grammatical structures with syntactic \nhomonymy (mainly case) are compiled. For example, two noun phrases with a \ncomplex subordinate: 'A trinket from Pompeii that has survived the centuries'.\nRequests corresponding to these constructions are submitted in search of the \nRussian National Corpus, or rather its sub-corpus with removed homonymy. In the \nresulting 2+k examples, homonymy is removed automatically with manual validation\nafterward. Each original sentence is split into multiple examples in the binary \nclassification format, indicating whether the homonymy is resolved correctly or\nnot.", "dataset_name": "RussianNLP/tape"}, "ru_openbook.raw": {"config_name": "ru_openbook.raw", "sample_row": "{\"ID\": \"\\\"7-980\\\"\", \"question\": \"\\\"\\\\u0421\\\\u043e\\\\u043b\\\\u043d\\\\u0446\\\\u0435 \\\\u043e\\\\u0442...\", \"answer\": \"\\\"D\\\"\"}", "columns": ["ID", "question", "answer"], "columns_mapping": {"ID": "ID", "question": "question", "answer": "answer"}, "dataset_description": "OpenBookQA for Russian is mainly based on the work of (Mihaylov et al., 2018):\nit is a QA dataset with multiple-choice elementary-level science questions, \nwhich probe the understanding of 1k+ core science facts. The dataset is mainly \ncomposed of automatic translation and human validation and correction. ", "dataset_name": "RussianNLP/tape"}, "ru_worldtree.raw": {"config_name": "ru_worldtree.raw", "sample_row": "{\"question\": \"\\\"\\\\u041d\\\\u0435\\\\u043a\\\\u043e\\\\u0442\\\\u043e\\\\u0440\\\\u044b\\\\...\", \"exam_name\": \"\\\"MCAS\\\"\", \"school_grade\": \"\\\"5\\\"\", \"knowledge_type\": \"\\\"MODEL\\\"\", \"answer\": \"\\\"C\\\"\"}", "columns": ["question", "exam_name", "school_grade", "knowledge_type", "answer"], "columns_mapping": {"question": "question", "exam_name": "exam_name", "school_grade": "school_grade", "knowledge_type": "knowledge_type", "answer": "answer"}, "dataset_description": "The WorldTree task is very similar to the pipeline on the OpenBookQA, the main\ndifference being the additional lists of facts and the logical order that is \nattached to the output of each answer to a question (Jansen et al., 2018).", "dataset_name": "RussianNLP/tape"}, "multiq.raw": {"config_name": "multiq.raw", "sample_row": "{\"support_text\": \"\\\"\\\\u0414\\\\u0430\\\\u043d\\\\u0438\\\\u0435\\\\u043b (\\\\u0414\\\\u043...\", \"main_text\": \"\\\"\\\\u0427\\\\u0424\\\\u0420 \\\\u041a\\\\u043b\\\\u0443\\\\u0436 \\\\u201...\", \"question\": \"\\\"\\\\u0412 \\\\u043a\\\\u0430\\\\u043a\\\\u043e\\\\u0439 \\\\u043b\\\\u043...\", \"bridge_answers\": \"[{\\\"label\\\": \\\"passage\\\", \\\"offset\\\": 738, \\\"length\\\": 8, ...\", \"main_answers\": \"[{\\\"label\\\": \\\"passage\\\", \\\"offset\\\": 294, \\\"length\\\": 14,...\"}", "columns": ["support_text", "main_text", "question", "bridge_answers", "main_answers"], "columns_mapping": {"support_text": "support_text", "main_text": "main_text", "question": "question", "bridge_answers": "bridge_answers", "main_answers": "main_answers"}, "dataset_description": "Multi-hop reasoning has been the least addressed QA direction for Russian. We \nhave developed a semi-automatic pipeline for multi-hop dataset generation based \non Wikidata.\n\nFirst, we extract the triplets from Wikidata and search for their intersections. \nTwo triplets (subject, verb, object) are needed to compose an answerable multi-hop \nquestion. For instance, the question 'What continent is the country of which \nJohannes Block was a citizen?' is formed by a sequence of five graph units: 'Block, \nJohannes', 'citizenship', 'Germany', 'part of the world', 'Europe'. Second, several \nhundreds of the question templates are curated by a few authors manually, which are\nfurther used to fine-tune ruT5-largeto generate multi-hop questions given a \nfive-fold sequence. Third, the resulting questions undergo a paraphrasing and manual\nvalidation procedure to control the quality and diversity. Finally, each question is\nlinked to two Wikipedia paragraphs, where all graph units appear in the natural \nlanguage. The task is to select the answer span using information from both \nparagraphs.", "dataset_name": "RussianNLP/tape"}, "chegeka.raw": {"config_name": "chegeka.raw", "sample_row": "{\"question_id\": \"0\", \"question\": \"\\\"\\\\u0421\\\\u043a\\\\u0430\\\\u0436\\\\u0438\\\\u0442\\\\u0435 \\\\u043f...\", \"topic\": \"\\\"\\\\u0412 \\\\u041f\\\\u0415\\\\u0420\\\\u0415\\\\u0412\\\\u041e\\\\u0414...\", \"author\": \"\\\"\\\\u042e\\\\u0440\\\\u0438\\\\u0439 \\\\u0413\\\\u0440\\\\u0438\\\\u0448...\", \"tour_name\": \"\\\"\\\\\\\"\\\\u0421\\\\u0432\\\\u043e\\\\u044f \\\\u0438\\\\u0433\\\\u0440\\\\u04...\", \"tour_link\": \"\\\"https://db.chgk.info/tour/grishov\\\"\", \"answer\": \"\\\"\\\\u0422\\\\u0430\\\\u043d\\\\u043a\\\\u0430\\\"\"}", "columns": ["question_id", "question", "topic", "author", "tour_name", "tour_link", "answer"], "columns_mapping": {"question_id": "question_id", "question": "question", "topic": "topic", "author": "author", "tour_name": "tour_name", "tour_link": "tour_link", "answer": "answer"}, "dataset_description": "The CheGeKa game setup is similar to Jeopardy. The player should come up with \nthe answer to the question basing on wit, commonsense and deep knowledge. \nThe task format is QA with a free response form and is based on the reviewed \nunpublished data subsets by (Mikhalkova, 2021).", "dataset_name": "RussianNLP/tape"}, "sit_ethics.raw": {"config_name": "sit_ethics.raw", "sample_row": "{\"source\": \"\\\"lenta\\\"\", \"text\": \"\\\"\\\\u0420\\\\u043e\\\\u0441\\\\u0441\\\\u0438\\\\u044f\\\\u043d\\\\u0435 ...\", \"sit_virtue\": \"0\", \"sit_moral\": \"1\", \"sit_law\": \"0\", \"sit_justice\": \"0\", \"sit_util\": \"0\"}", "columns": ["source", "text", "sit_virtue", "sit_moral", "sit_law", "sit_justice", "sit_util"], "columns_mapping": {"source": "source", "text": "text", "sit_virtue": "sit_virtue", "sit_moral": "sit_moral", "sit_law": "sit_law", "sit_justice": "sit_justice", "sit_util": "sit_util"}, "dataset_description": "The Ethics dataset for Russian is created from scratch for the first time, relying \non the design compatible with (Hendrycks et al., 2021). The task is to predict \nhuman ethical judgments about diverse text situations, namely, to identify the \npresence of concepts in normative ethics, such as virtue, law, moral, justice, and \nutilitarianism.", "dataset_name": "RussianNLP/tape"}, "per_ethics.raw": {"config_name": "per_ethics.raw", "sample_row": "{\"source\": \"\\\"lenta\\\"\", \"text\": \"\\\"\\\\u0416\\\\u0443\\\\u0440\\\\u043d\\\\u0430\\\\u043b\\\\u0438\\\\u0441\\\\...\", \"per_virtue\": \"1\", \"per_moral\": \"0\", \"per_law\": \"1\", \"per_justice\": \"1\", \"per_util\": \"0\"}", "columns": ["source", "text", "per_virtue", "per_moral", "per_law", "per_justice", "per_util"], "columns_mapping": {"source": "source", "text": "text", "per_virtue": "per_virtue", "per_moral": "per_moral", "per_law": "per_law", "per_justice": "per_justice", "per_util": "per_util"}, "dataset_description": "The Ethics dataset for Russian is created from scratch for the first time, relying \non the design compatible with (Hendrycks et al., 2021). The task is to predict \nhuman ethical judgments about diverse text situations, namely, to evaluate the \npositive or negative implementation of five concepts in normative ethics (virtue, \nlaw, moral, justice, and utilitarianism) with 'yes' and 'no' ratings.", "dataset_name": "RussianNLP/tape"}, "winograd.episodes": {"config_name": "winograd.episodes", "sample_row": "{\"text\": \"\\\"\\\\u041d\\\\u0435 \\\\u043c\\\\u0435\\\\u043d\\\\u0435\\\\u0435 \\\\u043...\", \"label\": \"1\", \"options\": \"[\\\"\\\\u043f\\\\u0430\\\\u043b\\\\u044c\\\\u043c\\\\u0430\\\", \\\"\\\\u0410\\\\u...\", \"reference\": \"\\\"\\\\u043a\\\\u043e\\\\u0442\\\\u043e\\\\u0440\\\\u0430\\\\u044f\\\"\", \"homonymia_type\": \"1.1\", \"answer\": \"\\\"\\\\u043f\\\\u0430\\\\u043b\\\\u044c\\\\u043c\\\\u0430\\\"\", \"perturbation\": \"\\\"winograd\\\"\", \"episode\": \"[15]\"}", "columns": ["text", "label", "options", "reference", "homonymia_type", "answer", "perturbation", "episode"], "columns_mapping": {"text": "text", "label": "label", "options": "options", "reference": "reference", "homonymia_type": "homonymia_type", "answer": "answer", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "The Winograd schema challenge composes tasks with syntactic ambiguity,\nwhich can be resolved with logic and reasoning (Levesque et al., 2012).\n\nThe texts for the Winograd schema problem are obtained using a semi-automatic \npipeline. First, lists of 11 typical grammatical structures with syntactic \nhomonymy (mainly case) are compiled. For example, two noun phrases with a \ncomplex subordinate: 'A trinket from Pompeii that has survived the centuries'.\nRequests corresponding to these constructions are submitted in search of the \nRussian National Corpus, or rather its sub-corpus with removed homonymy. In the \nresulting 2+k examples, homonymy is removed automatically with manual validation\nafterward. Each original sentence is split into multiple examples in the binary \nclassification format, indicating whether the homonymy is resolved correctly or\nnot.", "dataset_name": "RussianNLP/tape"}, "ru_openbook.episodes": {"config_name": "ru_openbook.episodes", "sample_row": "{\"ID\": \"\\\"7-674\\\"\", \"question\": \"\\\"\\\\u0415\\\\u0441\\\\u043b\\\\u0438 \\\\u0436\\\\u0438\\\\u0432\\\\u043e...\", \"answer\": \"\\\"A\\\"\", \"perturbation\": \"\\\"ru_openbook\\\"\", \"episode\": \"[11]\"}", "columns": ["ID", "question", "answer", "perturbation", "episode"], "columns_mapping": {"ID": "ID", "question": "question", "answer": "answer", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "OpenBookQA for Russian is mainly based on the work of (Mihaylov et al., 2018):\nit is a QA dataset with multiple-choice elementary-level science questions, \nwhich probe the understanding of 1k+ core science facts. The dataset is mainly \ncomposed of automatic translation and human validation and correction. ", "dataset_name": "RussianNLP/tape"}, "ru_worldtree.episodes": {"config_name": "ru_worldtree.episodes", "sample_row": "{\"question\": \"\\\"\\\\u0422\\\\u0443\\\\u043d\\\\u0435\\\\u0446 - \\\\u044d\\\\u0442\\\\u04...\", \"exam_name\": \"\\\"MCAS\\\"\", \"school_grade\": \"\\\"5\\\"\", \"knowledge_type\": \"\\\"CAUSAL,MODEL\\\"\", \"answer\": \"\\\"A\\\"\", \"perturbation\": \"\\\"ru_worldtree\\\"\", \"episode\": \"[10, 11]\"}", "columns": ["question", "exam_name", "school_grade", "knowledge_type", "answer", "perturbation", "episode"], "columns_mapping": {"question": "question", "exam_name": "exam_name", "school_grade": "school_grade", "knowledge_type": "knowledge_type", "answer": "answer", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "The WorldTree task is very similar to the pipeline on the OpenBookQA, the main\ndifference being the additional lists of facts and the logical order that is \nattached to the output of each answer to a question (Jansen et al., 2018).", "dataset_name": "RussianNLP/tape"}, "multiq.episodes": {"config_name": "multiq.episodes", "sample_row": "{\"support_text\": \"\\\"\\\\u041f\\\\u0430\\\\u0431\\\\u043b\\\\u043e \\\\u0410\\\\u043d\\\\u0434...\", \"main_text\": \"\\\"'\\\\u0411\\\\u0430\\\\u043d\\\\u0444\\\\u0438\\\\u043b\\\\u0434' (\\\\u0...\", \"question\": \"\\\"\\\\u0412 \\\\u043a\\\\u0430\\\\u043a\\\\u043e\\\\u0439 \\\\u043b\\\\u043...\", \"bridge_answers\": \"[{\\\"label\\\": \\\"passage\\\", \\\"offset\\\": 528, \\\"length\\\": 8, ...\", \"main_answers\": \"[{\\\"label\\\": \\\"passage\\\", \\\"offset\\\": 350, \\\"length\\\": 16,...\", \"perturbation\": \"\\\"multiq\\\"\", \"episode\": \"[18]\"}", "columns": ["support_text", "main_text", "question", "bridge_answers", "main_answers", "perturbation", "episode"], "columns_mapping": {"support_text": "support_text", "main_text": "main_text", "question": "question", "bridge_answers": "bridge_answers", "main_answers": "main_answers", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "Multi-hop reasoning has been the least addressed QA direction for Russian. We \nhave developed a semi-automatic pipeline for multi-hop dataset generation based \non Wikidata.\n\nFirst, we extract the triplets from Wikidata and search for their intersections. \nTwo triplets (subject, verb, object) are needed to compose an answerable multi-hop \nquestion. For instance, the question 'What continent is the country of which \nJohannes Block was a citizen?' is formed by a sequence of five graph units: 'Block, \nJohannes', 'citizenship', 'Germany', 'part of the world', 'Europe'. Second, several \nhundreds of the question templates are curated by a few authors manually, which are\nfurther used to fine-tune ruT5-largeto generate multi-hop questions given a \nfive-fold sequence. Third, the resulting questions undergo a paraphrasing and manual\nvalidation procedure to control the quality and diversity. Finally, each question is\nlinked to two Wikipedia paragraphs, where all graph units appear in the natural \nlanguage. The task is to select the answer span using information from both \nparagraphs.", "dataset_name": "RussianNLP/tape"}, "chegeka.episodes": {"config_name": "chegeka.episodes", "sample_row": "{\"question_id\": \"966\", \"question\": \"\\\"\\\\\\\"\\\\u041a\\\\u0430\\\\u0436\\\\u0434\\\\u0443\\\\u044e \\\\u043d\\\\u04...\", \"topic\": \"\\\"\\\\u041f\\\\u0435\\\\u0441\\\\u043d\\\\u0438-25\\\"\", \"author\": \"\\\"\\\\u0414\\\\u043c\\\\u0438\\\\u0442\\\\u0440\\\\u0438\\\\u0439 \\\\u0411...\", \"tour_name\": \"\\\"\\\\\\\"\\\\u0421\\\\u0432\\\\u043e\\\\u044f \\\\u0438\\\\u0433\\\\u0440\\\\u04...\", \"tour_link\": \"\\\"https://db.chgk.info/tour/spbrock\\\"\", \"answer\": \"\\\"\\\\u041e\\\\u043a\\\\u043d\\\\u0430\\\"\", \"perturbation\": \"\\\"chegeka\\\"\", \"episode\": \"[13, 18]\"}", "columns": ["question_id", "question", "topic", "author", "tour_name", "tour_link", "answer", "perturbation", "episode"], "columns_mapping": {"question_id": "question_id", "question": "question", "topic": "topic", "author": "author", "tour_name": "tour_name", "tour_link": "tour_link", "answer": "answer", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "The CheGeKa game setup is similar to Jeopardy. The player should come up with \nthe answer to the question basing on wit, commonsense and deep knowledge. \nThe task format is QA with a free response form and is based on the reviewed \nunpublished data subsets by (Mikhalkova, 2021).", "dataset_name": "RussianNLP/tape"}, "sit_ethics.episodes": {"config_name": "sit_ethics.episodes", "sample_row": "{\"source\": \"\\\"gazeta\\\"\", \"text\": \"\\\"\\\\u042d\\\\u043a\\\\u0441-\\\\u043d\\\\u0430\\\\u0441\\\\u0442\\\\u0430...\", \"sit_virtue\": \"0\", \"sit_moral\": \"0\", \"sit_law\": \"0\", \"sit_justice\": \"0\", \"sit_util\": \"0\", \"perturbation\": \"\\\"sit_ethics\\\"\", \"episode\": \"[5]\"}", "columns": ["source", "text", "sit_virtue", "sit_moral", "sit_law", "sit_justice", "sit_util", "perturbation", "episode"], "columns_mapping": {"source": "source", "text": "text", "sit_virtue": "sit_virtue", "sit_moral": "sit_moral", "sit_law": "sit_law", "sit_justice": "sit_justice", "sit_util": "sit_util", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "The Ethics dataset for Russian is created from scratch for the first time, relying \non the design compatible with (Hendrycks et al., 2021). The task is to predict \nhuman ethical judgments about diverse text situations, namely, to identify the \npresence of concepts in normative ethics, such as virtue, law, moral, justice, and \nutilitarianism.", "dataset_name": "RussianNLP/tape"}, "per_ethics.episodes": {"config_name": "per_ethics.episodes", "sample_row": "{\"source\": \"\\\"interfax\\\"\", \"text\": \"\\\"\\\\u0412\\\\u0430\\\\u0448\\\\u0438\\\\u043d\\\\u0433\\\\u0442\\\\u043e\\\\...\", \"per_virtue\": \"1\", \"per_moral\": \"0\", \"per_law\": \"1\", \"per_justice\": \"1\", \"per_util\": \"0\", \"perturbation\": \"\\\"per_ethics\\\"\", \"episode\": \"[5]\"}", "columns": ["source", "text", "per_virtue", "per_moral", "per_law", "per_justice", "per_util", "perturbation", "episode"], "columns_mapping": {"source": "source", "text": "text", "per_virtue": "per_virtue", "per_moral": "per_moral", "per_law": "per_law", "per_justice": "per_justice", "per_util": "per_util", "perturbation": "perturbation", "episode": "episode"}, "dataset_description": "The Ethics dataset for Russian is created from scratch for the first time, relying \non the design compatible with (Hendrycks et al., 2021). The task is to predict \nhuman ethical judgments about diverse text situations, namely, to evaluate the \npositive or negative implementation of five concepts in normative ethics (virtue, \nlaw, moral, justice, and utilitarianism) with 'yes' and 'no' ratings.", "dataset_name": "RussianNLP/tape"}}, "tags": ["task_categories:text-classification", "task_categories:question-answering", "task_categories:multiple-choice", "language:ru", "benchmark", "ethics", "question-answering", "reasoning"], "is_gated": false}, "csebuetnlp/BanglaParaphrase": {"dataset_name": "csebuetnlp/BanglaParaphrase", "description": "We present a high quality bangla paraphrase dataset containing about 466k paraphrase pairs. The paraphrases ensures high quality by being semantically coherent and syntactically diverse.", "downloads": 14, "configs": {"bn": {"config_name": "bn", "sample_row": "{\"source\": \"\\\"\\\\u09ac\\\\u09bf\\\\u09ae\\\\u09be\\\\u09a8\\\\u099f\\\\u09bf \\\\u09af...\", \"target\": \"\\\"\\\\u09ac\\\\u09bf\\\\u09ae\\\\u09be\\\\u09a8\\\\u099f\\\\u09be \\\\u098f...\"}", "columns": ["source", "target"], "columns_mapping": {"source": "source", "target": "target"}, "dataset_description": "We present a high quality bangla paraphrase dataset containing about 466k paraphrase pairs. The paraphrases ensures high quality by being semantically coherent and syntactically diverse.\n\n", "dataset_name": "csebuetnlp/BanglaParaphrase"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:bn", "conditional-text-generation", "paraphrase-generation"], "is_gated": false}, "elenanereiss/german-ler": {"dataset_name": "elenanereiss/german-ler", "description": "A dataset of Legal Documents from German federal court decisions for Named Entity Recognition. The dataset is human-annotated with 19 fine-grained entity classes. The dataset consists of approx. 67,000 sentences and contains 54,000 annotated entities.", "downloads": 61, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"dd\\\", \\\")\\\", \\\"Art.\\\", \\\"33\\\", \\\"Abs.\\\", \\\"5\\\", \\\"GG\\\", \\\"w\\\\u0...\", \"ner_tags\": \"[38, 38, 3, 22, 22, 22, 22, 38, 38, 38, 38, 38, 38...\", \"ner_coarse_tags\": \"[14, 14, 2, 9, 9, 9, 9, 14, 14, 14, 14, 14, 14, 14...\"}", "columns": ["id", "tokens", "ner_tags", "ner_coarse_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_coarse_tags": "ner_coarse_tags"}, "dataset_description": "A dataset of Legal Documents from German federal court decisions for Named Entity Recognition. The dataset is human-annotated with 19 fine-grained entity classes. The dataset consists of approx. 67,000 sentences and contains 54,000 annotated entities.\n", "dataset_name": "elenanereiss/german-ler"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:de", "ner, named entity recognition, legal ner, legal texts, label classification", "doi:10.57967/hf/0046"], "is_gated": false}, "taln-ls2n/kpbiomed": {"dataset_name": "taln-ls2n/kpbiomed", "description": "KPBiomed benchmark dataset for keyphrase extraction an generation.", "downloads": 18, "configs": {"large": {"config_name": "large", "sample_row": "{\"id\": \"\\\"31703611\\\"\", \"title\": \"\\\"Recommendations for performance optimizations whe...\", \"abstract\": \"\\\"BACKGROUND\\\\nUse of the Genome Analysis Toolkit (G...\", \"authors\": \"\\\"['Heldenbrand|Jacob R|JR|', 'Baheti|Saurabh|S|', ...\", \"mesh_terms\": \"[\\\"D000465:Algorithms\\\", \\\"D002877:Chromosomes, Human...\", \"year\": \"\\\"2019\\\"\", \"keyphrases\": \"[\\\"GATK\\\", \\\"Genomic variant calling\\\", \\\"Computational...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"M\\\"]\"}", "columns": ["id", "title", "abstract", "authors", "mesh_terms", "year", "keyphrases", "prmu"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "authors": "authors", "mesh_terms": "mesh_terms", "year": "year", "keyphrases": "keyphrases", "prmu": "prmu"}, "dataset_description": "KPBiomed benchmark dataset for keyphrase extraction an generation.\n", "dataset_name": "taln-ls2n/kpbiomed"}, "medium": {"config_name": "medium", "sample_row": "{\"id\": \"\\\"28495559\\\"\", \"title\": \"\\\"Influence of adhesive strategy on clinical parame...\", \"abstract\": \"\\\"OBJECTIVES\\\\nWe aimed to answer the following PICO...\", \"authors\": \"\\\"['Schroeder|Marcos|M|', 'Correa|Ivo Carlos|IC|', ...\", \"mesh_terms\": \"[\\\"D000134:Acid Etching, Dental\\\", \\\"D003188:Composit...\", \"year\": \"\\\"2017\\\"\", \"keyphrases\": \"[\\\"Systematic review\\\", \\\"Postoperative sensitivity\\\",...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"M\\\"]\"}", "columns": ["id", "title", "abstract", "authors", "mesh_terms", "year", "keyphrases", "prmu"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "authors": "authors", "mesh_terms": "mesh_terms", "year": "year", "keyphrases": "keyphrases", "prmu": "prmu"}, "dataset_description": "KPBiomed benchmark dataset for keyphrase extraction an generation.\n", "dataset_name": "taln-ls2n/kpbiomed"}, "small": {"config_name": "small", "sample_row": "{\"id\": \"\\\"32305274\\\"\", \"title\": \"\\\"A novel tube technique enables visualization of t...\", \"abstract\": \"\\\"BACKGROUND\\\\nPercutaneous pedicle screws(PPS) have...\", \"authors\": \"\\\"['Li|Xu|X|', 'Zhang|Rui|R|', 'Chen|Buzhou|B|', 'D...\", \"mesh_terms\": \"[\\\"D005471:Fluoroscopy\\\", \\\"D050723:Fractures, Bone\\\",...\", \"year\": \"\\\"2020\\\"\", \"keyphrases\": \"[\\\"Jamshidi needles\\\", \\\"Radiation exposure\\\", \\\"15\\\\u00...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"M\\\", \\\"R\\\", \\\"M\\\"]\"}", "columns": ["id", "title", "abstract", "authors", "mesh_terms", "year", "keyphrases", "prmu"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "authors": "authors", "mesh_terms": "mesh_terms", "year": "year", "keyphrases": "keyphrases", "prmu": "prmu"}, "dataset_description": "KPBiomed benchmark dataset for keyphrase extraction an generation.\n", "dataset_name": "taln-ls2n/kpbiomed"}}, "tags": ["task_categories:text-generation", "annotations_creators:unknown", "multilinguality:monolingual", "language:en"], "is_gated": false}, "AmazonScience/mintaka": {"dataset_name": "AmazonScience/mintaka", "description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers.", "downloads": 94, "configs": {"en": {"config_name": "en", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"en\\\"\", \"question\": \"\\\"What is the seventh tallest mountain in North Ame...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Mount Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. \n", "dataset_name": "AmazonScience/mintaka"}, "ar": {"config_name": "ar", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"ar\\\"\", \"question\": \"\\\"\\\\u0645\\\\u0627 \\\\u0633\\\\u0627\\\\u0628\\\\u0639 \\\\u0623\\\\u063...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": null}]\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. \n", "dataset_name": "AmazonScience/mintaka"}, "de": {"config_name": "de", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"de\\\"\", \"question\": \"\\\"Wie hei\\\\u00dft der siebth\\\\u00f6chste Berg Nordame...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Mount Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. \n", "dataset_name": "AmazonScience/mintaka"}, "ja": {"config_name": "ja", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"ja\\\"\", \"question\": \"\\\"\\\\u5317\\\\u30a2\\\\u30e1\\\\u30ea\\\\u30ab\\\\u3067\\\\u4e03\\\\u756a\\\\...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"\\\\u30eb\\\\u30ab\\\\u30cb...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. \n", "dataset_name": "AmazonScience/mintaka"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"hi\\\"\", \"question\": \"\\\"\\\\u0909\\\\u0924\\\\u094d\\\\u0924\\\\u0930 \\\\u0905\\\\u092e\\\\u0947...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": null}]\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. \n", "dataset_name": "AmazonScience/mintaka"}, "pt": {"config_name": "pt", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"pt\\\"\", \"question\": \"\\\"Qual \\\\u00e9 a s\\\\u00e9tima montanha mais alta da A...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Monte Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. \n", "dataset_name": "AmazonScience/mintaka"}, "es": {"config_name": "es", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"es\\\"\", \"question\": \"\\\"\\\\u00bfCu\\\\u00e1l es la s\\\\u00e9ptima monta\\\\u00f1a m...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Monte Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. \n", "dataset_name": "AmazonScience/mintaka"}, "it": {"config_name": "it", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"it\\\"\", \"question\": \"\\\"Qual \\\\u00e8 la settima montagna pi\\\\u00f9 alta del...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Monte Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. \n", "dataset_name": "AmazonScience/mintaka"}, "fr": {"config_name": "fr", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"fr\\\"\", \"question\": \"\\\"Quelle est la septi\\\\u00e8me plus haute montagne d...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"mont Lucania\\\"}]\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. \n", "dataset_name": "AmazonScience/mintaka"}, "all": {"config_name": "all", "sample_row": "{\"id\": \"\\\"a9011ddf\\\"\", \"lang\": \"\\\"en\\\"\", \"question\": \"\\\"What is the seventh tallest mountain in North Ame...\", \"answerText\": \"\\\"Mount Lucania\\\"\", \"category\": \"\\\"geography\\\"\", \"complexityType\": \"\\\"ordinal\\\"\", \"questionEntity\": \"[{\\\"name\\\": \\\"Q49\\\", \\\"entityType\\\": \\\"entity\\\", \\\"label\\\": ...\", \"answerEntity\": \"[{\\\"name\\\": \\\"Q1153188\\\", \\\"label\\\": \\\"Mount Lucania\\\"}]...\"}", "columns": ["id", "lang", "question", "answerText", "category", "complexityType", "questionEntity", "answerEntity"], "columns_mapping": {"id": "id", "lang": "lang", "question": "question", "answerText": "answerText", "category": "category", "complexityType": "complexityType", "questionEntity": "questionEntity", "answerEntity": "answerEntity"}, "dataset_description": " Mintaka is a complex, natural, and multilingual dataset designed for experimenting with end-to-end\n question-answering models. Mintaka is composed of 20,000 question-answer pairs collected in English,\n annotated with Wikidata entities, and translated into Arabic, French, German, Hindi, Italian,\n Japanese, Portuguese, and Spanish for a total of 180,000 samples.\n Mintaka includes 8 types of complex questions, including superlative, intersection, and multi-hop questions, \n which were naturally elicited from crowd workers. \n", "dataset_name": "AmazonScience/mintaka"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:expert-generated", "multilinguality:ar", "multilinguality:de", "multilinguality:ja", "multilinguality:hi", "multilinguality:pt", "multilinguality:en", "multilinguality:es", "multilinguality:it", "multilinguality:fr", "source_datasets:original"], "is_gated": false}, "GEM/TaTA": {"dataset_name": "GEM/TaTA", "description": "Dataset loader for TaTA: A Multilingual Table-to-Text Dataset for African Languages", "downloads": 90, "configs": {"default": {"config_name": "default", "sample_row": "{\"gem_id\": \"\\\"AB20-ar-1\\\"\", \"example_id\": \"\\\"AB20-ar-1\\\"\", \"title\": \"\\\"\\\\u062a\\\\u0645\\\\u0643\\\\u064a\\\\u0646 \\\\u0627\\\\u0644\\\\u0634...\", \"unit_of_measure\": \"\\\"\\\\u0627\\\\u0644\\\\u0646\\\\u0633\\\\u0628\\\\u0629 \\\\u0627\\\\u0644...\", \"chart_type\": \"\\\"Horizontal Bar Chart\\\"\", \"was_translated\": \"\\\"True\\\"\", \"table_data\": \"\\\"[[\\\\\\\"\\\\\\\", \\\\\\\"\\\\\\\\u0645\\\\\\\\u0633\\\\\\\\u062a\\\\\\\\u0648\\\\\\\\u0649 \\\\\\\\u...\", \"linearized_input\": \"\\\"\\\\u062a\\\\u0645\\\\u0643\\\\u064a\\\\u0646 \\\\u0627\\\\u0644\\\\u0634...\", \"table_text\": \"[\\\"\\\\u062a\\\\u062a\\\\u0645\\\\u062a\\\\u0639 13% \\\\u0645\\\\u0646 ...\", \"target\": \"\\\"\\\\u062a\\\\u062a\\\\u0645\\\\u062a\\\\u0639 13% \\\\u0645\\\\u0646 \\\\...\"}", "columns": ["gem_id", "example_id", "title", "unit_of_measure", "chart_type", "was_translated", "table_data", "linearized_input", "table_text", "target"], "columns_mapping": {"gem_id": "gem_id", "example_id": "example_id", "title": "title", "unit_of_measure": "unit_of_measure", "chart_type": "chart_type", "was_translated": "was_translated", "table_data": "table_data", "linearized_input": "linearized_input", "table_text": "table_text", "target": "target"}, "dataset_description": "Dataset loader for TaTA: A Multilingual Table-to-Text Dataset for African Languages\n", "dataset_name": "GEM/TaTA"}}, "tags": ["task_categories:table-to-text", "annotations_creators:none", "multilinguality:yes", "source_datasets:original", "language:ar", "language:en", "language:fr", "language:ha", "language:ig", "language:pt", "language:ru", "language:sw", "language:yo", "data-to-text"], "is_gated": false}, "allenai/csabstruct": {"dataset_name": "allenai/csabstruct", "description": "As a step toward better document-level understanding, we explore classification of a sequence of sentences into their corresponding categories, a task that requires understanding sentences in context of the document. Recent successful models for this task have used hierarchical models to contextualize sentence representations, and Conditional Random Fields (CRFs) to incorporate dependencies between subsequent labels. In this work, we show that pretrained language models, BERT (Devlin et al., 2018) in particular, can be used for this task to capture contextual dependencies without the need for hierarchical encoding nor a CRF. Specifically, we construct a joint sentence representation that allows BERT Transformer layers to directly utilize contextual information from all words in all sentences. Our approach achieves state-of-the-art results on four datasets, including a new dataset of structured scientific abstracts.", "downloads": 79, "configs": {"CSAbstruct": {"config_name": "CSAbstruct", "sample_row": "{\"abstract_id\": \"\\\"train_0000\\\"\", \"sentences\": \"[\\\"Gamification has the potential to improve the qu...\", \"labels\": \"[0, 2, 1, 1, 4, 4, 4, 4, 4]\", \"confs\": \"[0.7778, 0.7778, 0.7778, 1.0, 0.6111, 0.5556, 0.61...\"}", "columns": ["abstract_id", "sentences", "labels", "confs"], "columns_mapping": {"abstract_id": "abstract_id", "sentences": "sentences", "labels": "labels", "confs": "confs"}, "dataset_description": "As a step toward better document-level understanding, we explore classification of a sequence of sentences into their corresponding categories, a task that requires understanding sentences in context of the document. Recent successful models for this task have used hierarchical models to contextualize sentence representations, and Conditional Random Fields (CRFs) to incorporate dependencies between subsequent labels. In this work, we show that pretrained language models, BERT (Devlin et al., 2018) in particular, can be used for this task to capture contextual dependencies without the need for hierarchical encoding nor a CRF. Specifically, we construct a joint sentence representation that allows BERT Transformer layers to directly utilize contextual information from all words in all sentences. Our approach achieves state-of-the-art results on four datasets, including a new dataset of structured scientific abstracts.\n", "dataset_name": "allenai/csabstruct"}}, "tags": [], "is_gated": false}, "sileod/probability_words_nli": {"dataset_name": "sileod/probability_words_nli", "description": "Probing neural language models for understanding of words of estimative probability", "downloads": 30, "configs": {"reasoning_1hop": {"config_name": "reasoning_1hop", "sample_row": "{\"context\": \"\\\"It is probably not the case that Mary is in the s...\", \"hypothesis\": \"\\\"We believe that 'Julius is a frog' or 'Mary is in...\", \"valid_hypothesis\": \"\\\"It is probably not the case that 'Julius is a fro...\", \"invalid_hypothesis\": \"\\\"We believe that 'Julius is a frog' or 'Mary is in...\", \"problog\": \"\\\"\\\\n and(A,B) :- A,B.\\\\n or(A,B) :- A;B.\\\\n nand(A...\", \"probability_word\": \"\\\"probably not\\\"\", \"distractor\": \"\\\"we believe\\\"\", \"hypothesis_assertion\": \"\\\"'Julius is a frog' or 'Mary is in the school' or ...\", \"label\": \"0\", \"idx\": \"0\", \"probability\": \"0.325\"}", "columns": ["context", "hypothesis", "valid_hypothesis", "invalid_hypothesis", "problog", "probability_word", "distractor", "hypothesis_assertion", "label", "idx", "probability"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "valid_hypothesis": "valid_hypothesis", "invalid_hypothesis": "invalid_hypothesis", "problog": "problog", "probability_word": "probability_word", "distractor": "distractor", "hypothesis_assertion": "hypothesis_assertion", "label": "label", "idx": "idx", "probability": "probability"}, "dataset_description": "Probing neural language models for understanding of words of estimative probability\n", "dataset_name": "sileod/probability_words_nli"}, "reasoning_2hop": {"config_name": "reasoning_2hop", "sample_row": "{\"context\": \"\\\"There is almost no chance that Greg is gray. Chan...\", \"hypothesis\": \"\\\"It is highly likely that either 'John discarded t...\", \"valid_hypothesis\": \"\\\"It is unlikely that either 'John discarded the ap...\", \"invalid_hypothesis\": \"\\\"It is highly likely that either 'John discarded t...\", \"problog\": \"\\\"\\\\n and(A,B) :- A,B.\\\\n or(A,B) :- A;B.\\\\n nand(A...\", \"probability_word\": \"\\\"unlikely\\\"\", \"distractor\": \"\\\"highly likely\\\"\", \"hypothesis_assertion\": \"\\\"Either 'John discarded the apple' or 'Sandra got ...\", \"label\": \"0\", \"idx\": \"0\", \"probability\": \"0.18\"}", "columns": ["context", "hypothesis", "valid_hypothesis", "invalid_hypothesis", "problog", "probability_word", "distractor", "hypothesis_assertion", "label", "idx", "probability"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "valid_hypothesis": "valid_hypothesis", "invalid_hypothesis": "invalid_hypothesis", "problog": "problog", "probability_word": "probability_word", "distractor": "distractor", "hypothesis_assertion": "hypothesis_assertion", "label": "label", "idx": "idx", "probability": "probability"}, "dataset_description": "Probing neural language models for understanding of words of estimative probability\n", "dataset_name": "sileod/probability_words_nli"}, "usnli": {"config_name": "usnli", "sample_row": "{\"context\": \"\\\"Woman in white in foreground and a man slightly b...\", \"hypothesis\": \"\\\"We believe that they are working for John 's Pizz...\", \"valid_hypothesis\": \"\\\"We believe that they are working for John 's Pizz...\", \"invalid_hypothesis\": \"\\\"It is improbable that they are working for John '...\", \"probability_word\": \"\\\"we believe\\\"\", \"distractor\": \"\\\"improbable\\\"\", \"hypothesis_assertion\": \"\\\"They are working for John 's Pizza .\\\"\", \"label\": \"1\", \"idx\": \"0\", \"probability\": \"0.7445574122575764\"}", "columns": ["context", "hypothesis", "valid_hypothesis", "invalid_hypothesis", "probability_word", "distractor", "hypothesis_assertion", "label", "idx", "probability"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "valid_hypothesis": "valid_hypothesis", "invalid_hypothesis": "invalid_hypothesis", "probability_word": "probability_word", "distractor": "distractor", "hypothesis_assertion": "hypothesis_assertion", "label": "label", "idx": "idx", "probability": "probability"}, "dataset_description": "Probing neural language models for understanding of words of estimative probability\n", "dataset_name": "sileod/probability_words_nli"}}, "tags": ["task_categories:text-classification", "task_categories:multiple-choice", "task_categories:question-answering", "task_ids:open-domain-qa", "task_ids:multiple-choice-qa", "task_ids:natural-language-inference", "task_ids:multi-input-text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "wep", "words of estimative probability", "probability", "logical reasoning", "soft logic", "nli", "verbal probabilities", "natural-language-inference", "reasoning", "logic"], "is_gated": false}, "lmqg/qa_squadshifts": {"dataset_name": "lmqg/qa_squadshifts", "description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "downloads": 388, "configs": {"all": {"config_name": "all", "sample_row": "{\"id\": \"\\\"5d66f6322b22cd4dfcfbe7d9\\\"\", \"title\": \"\\\"None\\\"\", \"context\": \"\\\"Gas and electric service is provided by Consolida...\", \"question\": \"\\\"Consolidated Edison can trace it's roots back to ...\", \"answers.text\": \"[\\\"Edison Electric Illuminating Company\\\"]\", \"answers.answer_start\": \"[153]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "dataset_name": "lmqg/qa_squadshifts"}, "amazon": {"config_name": "amazon", "sample_row": "{\"id\": \"\\\"5dd4d824cc027a086d65fde6\\\"\", \"title\": \"\\\"None\\\"\", \"context\": \"\\\"This cabinet is very easy to assemble. It says &#...\", \"question\": \"\\\"How many people does the reviewer suggest it take...\", \"answers.text\": \"[\\\"It is a one-person job\\\"]\", \"answers.answer_start\": \"[143]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "dataset_name": "lmqg/qa_squadshifts"}, "new_wiki": {"config_name": "new_wiki", "sample_row": "{\"id\": \"\\\"5d66f6322b22cd4dfcfbe7d9\\\"\", \"title\": \"\\\"None\\\"\", \"context\": \"\\\"Gas and electric service is provided by Consolida...\", \"question\": \"\\\"Consolidated Edison can trace it's roots back to ...\", \"answers.text\": \"[\\\"Edison Electric Illuminating Company\\\"]\", \"answers.answer_start\": \"[153]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "dataset_name": "lmqg/qa_squadshifts"}, "nyt": {"config_name": "nyt", "sample_row": "{\"id\": \"\\\"5d704c4ac8e4820a9b66e9f7\\\"\", \"title\": \"\\\"None\\\"\", \"context\": \"\\\"Ms. Clyne is at work on a chamber opera about the...\", \"question\": \"\\\"Ms. Clyne used facsimiles of what possession of E...\", \"answers.text\": \"[\\\"letters\\\"]\", \"answers.answer_start\": \"[214]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "dataset_name": "lmqg/qa_squadshifts"}, "reddit": {"config_name": "reddit", "sample_row": "{\"id\": \"\\\"5d9c25298ae5305bc982eff7\\\"\", \"title\": \"\\\"None\\\"\", \"context\": \"\\\"Dis/advantages of 10.3 over 9.5? Just haxed my O3...\", \"question\": \"\\\"What is the author's main reason for wanting to h...\", \"answers.text\": \"[\\\"pokegenning/romhacking\\\"]\", \"answers.answer_start\": \"[468]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question answering task with custom split.", "dataset_name": "lmqg/qa_squadshifts"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "multilinguality:monolingual", "source_datasets:extended|wikipedia", "language:en"], "is_gated": false}, "Conrad747/lg-ner": {"dataset_name": "Conrad747/lg-ner", "description": "LugandaPII is a named entity dataset consisting of PERSON, ORG, LOCATION, NORP, USERID and DATE entities.\nThe train/validation/test sets are available for the Luganda language.", "downloads": 15, "configs": {"lug": {"config_name": "lug", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"bassentebe\\\", \\\"be\\\", \\\"##byalo\\\", \\\"balabuddwa\\\", \\\"oku...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "LugandaPII is a named entity dataset consisting of PERSON, ORG, LOCATION, NORP, USERID and DATE entities.\nThe train/validation/test sets are available for the Luganda language.\n", "dataset_name": "Conrad747/lg-ner"}}, "tags": [], "is_gated": false}, "lmqg/qag_tweetqa": {"dataset_name": "lmqg/qag_tweetqa", "description": "Question & answer generation dataset based on [TweetQA](https://huggingface.co/datasets/tweet_qa).", "downloads": 11, "configs": {"qag_tweetqa": {"config_name": "qag_tweetqa", "sample_row": "{\"answers\": \"[\\\"editor\\\", \\\"1991\\\", \\\"ben bradlee\\\", \\\"1994\\\"]\", \"questions\": \"[\\\"what did bradlee retire as?\\\", \\\"when did ben brad...\", \"paragraph\": \"\\\"\\\\\\\"So much of The Post is Ben,\\\\\\\" Mrs. Graham said ...\", \"paragraph_id\": \"\\\"78ac37b757cc7863a0bc39a34e8abe72-50539ee37b16f348...\", \"questions_answers\": \"\\\"question: what did bradlee retire as?, answer: ed...\"}", "columns": ["answers", "questions", "paragraph", "paragraph_id", "questions_answers"], "columns_mapping": {"answers": "answers", "questions": "questions", "paragraph": "paragraph", "paragraph_id": "paragraph_id", "questions_answers": "questions_answers"}, "dataset_description": "Question & answer generation dataset based on [TweetQA](https://huggingface.co/datasets/tweet_qa).", "dataset_name": "lmqg/qag_tweetqa"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:tweet_qa", "language:en", "question-generation"], "is_gated": false}, "lmqg/qag_squad": {"dataset_name": "lmqg/qag_squad", "description": "Question & answer generation dataset based on SQuAD.", "downloads": 13, "configs": {"qag_squad": {"config_name": "qag_squad", "sample_row": "{\"answers\": \"[\\\"4 Minutes\\\", \\\"Elvis Presley\\\", \\\"thirteenth\\\", \\\"Stic...\", \"questions\": \"[\\\"Which single was released as the album's lead si...\", \"paragraph\": \"\\\"\\\\\\\"4 Minutes\\\\\\\" was released as the album's lead si...\", \"questions_answers\": \"\\\"question: Which single was released as the album'...\"}", "columns": ["answers", "questions", "paragraph", "questions_answers"], "columns_mapping": {"answers": "answers", "questions": "questions", "paragraph": "paragraph", "questions_answers": "questions_answers"}, "dataset_description": "Question & answer generation dataset based on SQuAD.", "dataset_name": "lmqg/qag_squad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:lmqg/qg_squad", "language:en", "question-generation"], "is_gated": false}, "lawcompany/KLAID": {"dataset_name": "lawcompany/KLAID", "description": "KLAID (Korean Legal Artificial Intelligence Datasets) is a dataset for the development of Korean legal artificial intelligence technology. This time we offer 1 task, which is legal judgment prediction(LJP).", "downloads": 33, "configs": {"ljp": {"config_name": "ljp", "sample_row": "{\"laws_service_id\": \"32\", \"fact\": \"\\\"\\\\ud53c\\\\uace0\\\\uc778\\\\uc740 2018. 8. 9. 23:33\\\\uacbd ...\", \"laws_service\": \"\\\"\\\\ub3c4\\\\ub85c\\\\uad50\\\\ud1b5\\\\ubc95 \\\\uc81c148\\\\uc870\\\\uc...\"}", "columns": ["laws_service_id", "fact", "laws_service"], "columns_mapping": {"laws_service_id": "laws_service_id", "fact": "fact", "laws_service": "laws_service"}, "dataset_description": "KLAID (Korean Legal Artificial Intelligence Datasets) is a dataset for the development of Korean legal artificial intelligence technology. This time we offer 1 task, which is legal judgment prediction(LJP).\n", "dataset_name": "lawcompany/KLAID"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "multilinguality:monolingual", "language:ko"], "is_gated": false}, "bigbio/bc7_litcovid": {"dataset_name": "bigbio/bc7_litcovid", "description": "The training and development datasets contain the publicly-available text of over 30 thousand COVID-19-related articles and their metadata (e.g., title, abstract, journal). Articles in both datasets have been manually reviewed and articles annotated by in-house models.", "downloads": 103, "configs": {"bc7_litcovid_source": {"config_name": "bc7_litcovid_source", "sample_row": "{\"pmid\": \"\\\"32519164\\\"\", \"journal\": \"\\\"J Thromb Thrombolysis\\\"\", \"title\": \"\\\"Potential role for tissue factor in the pathogene...\", \"abstract\": \"\\\"In December 2019, a new and highly contagious inf...\", \"keywords\": \"[\\\"covid-19\\\", \\\"il-6\\\", \\\"sars-cov-2\\\", \\\"tnf-alpha\\\", \\\"t...\", \"pub_type\": \"[\\\"Journal Article\\\", \\\"Review\\\"]\", \"authors\": \"[\\\"Bautista-Vargas, Mario\\\", \\\"Bonilla-Abadia, Fabio\\\"...\", \"doi\": \"\\\"10.1007/s11239-020-02172-x\\\"\", \"labels\": \"[1, 3]\"}", "columns": ["pmid", "journal", "title", "abstract", "keywords", "pub_type", "authors", "doi", "labels"], "columns_mapping": {"pmid": "pmid", "journal": "journal", "title": "title", "abstract": "abstract", "keywords": "keywords", "pub_type": "pub_type", "authors": "authors", "doi": "doi", "labels": "labels"}, "dataset_description": "The training and development datasets contain the publicly-available text of over 30 thousand COVID-19-related articles and their metadata (e.g., title, abstract, journal). Articles in both datasets have been manually reviewed and articles annotated by in-house models.\n", "dataset_name": "bigbio/bc7_litcovid"}, "bc7_litcovid_bigbio_text": {"config_name": "bc7_litcovid_bigbio_text", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"32519164\\\"\", \"text\": \"\\\"In December 2019, a new and highly contagious inf...\", \"labels\": \"[\\\"Treatment\\\", \\\"Mechanism\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "The training and development datasets contain the publicly-available text of over 30 thousand COVID-19-related articles and their metadata (e.g., title, abstract, journal). Articles in both datasets have been manually reviewed and articles annotated by in-house models.\n", "dataset_name": "bigbio/bc7_litcovid"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bioinfer": {"dataset_name": "bigbio/bioinfer", "description": "A corpus targeted at protein, gene, and RNA relationships which serves as a\nresource for the development of information extraction systems and their\ncomponents such as parsers and domain analyzers. Currently, the corpus contains\n1100 sentences from abstracts of biomedical research articles annotated for\nrelationships, named entities, as well as syntactic dependencies.", "downloads": 49, "configs": {"bioinfer_source": {"config_name": "bioinfer_source", "sample_row": "{\"document_id\": \"\\\"BioInfer.d0.s0\\\"\", \"type\": \"\\\"Sentence\\\"\", \"text\": \"\\\"alpha-catenin inhibits beta-catenin signaling by ...\", \"entities\": \"[{\\\"id\\\": \\\"BioInfer.d0.s0.e0\\\", \\\"offsets\\\": [[88, 101]...\", \"relations\": \"[{\\\"id\\\": \\\"BioInfer.d0.s0.i0\\\", \\\"type\\\": \\\"PPI\\\", \\\"arg1_...\"}", "columns": ["document_id", "type", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "type": "type", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "A corpus targeted at protein, gene, and RNA relationships which serves as a\nresource for the development of information extraction systems and their\ncomponents such as parsers and domain analyzers. Currently, the corpus contains\n1100 sentences from abstracts of biomedical research articles annotated for\nrelationships, named entities, as well as syntactic dependencies.\n", "dataset_name": "bigbio/bioinfer"}, "bioinfer_bigbio_kb": {"config_name": "bioinfer_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BioInfer.d0.s0\\\"\", \"passages\": \"[{\\\"id\\\": \\\"BioInfer.d0.s0__text\\\", \\\"type\\\": \\\"Sentence\\\"...\", \"entities\": \"[{\\\"id\\\": \\\"BioInfer.d0.s0.e0\\\", \\\"type\\\": \\\"Individual_p...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"BioInfer.d0.s0.i0\\\", \\\"type\\\": \\\"PPI\\\", \\\"arg1_...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "A corpus targeted at protein, gene, and RNA relationships which serves as a\nresource for the development of information extraction systems and their\ncomponents such as parsers and domain analyzers. Currently, the corpus contains\n1100 sentences from abstracts of biomedical research articles annotated for\nrelationships, named entities, as well as syntactic dependencies.\n", "dataset_name": "bigbio/bioinfer"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/biology_how_why_corpus": {"dataset_name": "bigbio/biology_how_why_corpus", "description": "This dataset consists of 185 \"how\" and 193 \"why\" biology questions authored by a domain expert, with one or more gold \nanswer passages identified in an undergraduate textbook. The expert was not constrained in any way during the \nannotation process, so gold answers might be smaller than a paragraph or span multiple paragraphs. This dataset was \nused for the question-answering system described in the paper \u201cDiscourse Complements Lexical Semantics for Non-factoid \nAnswer Reranking\u201d (ACL 2014).", "downloads": 29, "configs": {"biology_how_why_corpus_source": {"config_name": "biology_how_why_corpus_source", "sample_row": "{\"text\": \"\\\"How does the second law of thermodynamics apply t...\", \"type\": \"\\\"how\\\"\", \"answers\": \"[{\\\"justification\\\": \\\"The second law of thermodynami...\"}", "columns": ["text", "type", "answers"], "columns_mapping": {"text": "text", "type": "type", "answers": "answers"}, "dataset_description": "This dataset consists of 185 \"how\" and 193 \"why\" biology questions authored by a domain expert, with one or more gold \nanswer passages identified in an undergraduate textbook. The expert was not constrained in any way during the \nannotation process, so gold answers might be smaller than a paragraph or span multiple paragraphs. This dataset was \nused for the question-answering system described in the paper \u201cDiscourse Complements Lexical Semantics for Non-factoid \nAnswer Reranking\u201d (ACL 2014).\n", "dataset_name": "bigbio/biology_how_why_corpus"}, "biology_how_why_corpus_bigbio_qa": {"config_name": "biology_how_why_corpus_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"1_8_6\\\"\", \"question\": \"\\\"How does the second law of thermodynamics apply t...\", \"type\": \"\\\"how\\\"\", \"choices\": \"[]\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"The second law of thermodynamics states that spo...\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "This dataset consists of 185 \"how\" and 193 \"why\" biology questions authored by a domain expert, with one or more gold \nanswer passages identified in an undergraduate textbook. The expert was not constrained in any way during the \nannotation process, so gold answers might be smaller than a paragraph or span multiple paragraphs. This dataset was \nused for the question-answering system described in the paper \u201cDiscourse Complements Lexical Semantics for Non-factoid \nAnswer Reranking\u201d (ACL 2014).\n", "dataset_name": "bigbio/biology_how_why_corpus"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/biomrc": {"dataset_name": "bigbio/biomrc", "description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.", "downloads": 12, "configs": {"biomrc_large_A_source": {"config_name": "biomrc_large_A_source", "sample_row": "{\"abstract\": \"\\\"BACKGROUND: Most brain metastases arise from @ent...\", \"title\": \"\\\"Attributes of brain metastases from XXXX .\\\\n\\\"\", \"entities_list.pseudoidentifier\": \"[\\\"@entity1\\\", \\\"@entity0\\\", \\\"@entity5\\\", \\\"@entity4\\\", \\\"...\", \"entities_list.identifier\": \"[\\\"('9606', 'Species')\\\", \\\"('MESH:D001943', 'Disease...\", \"entities_list.synonyms\": \"[\\\"['patients']\\\", \\\"['breast than lung cancer', 'bre...\", \"answer.pseudoidentifier\": \"\\\"@entity0\\\"\", \"answer.identifier\": \"\\\"(MESH:D001943,Disease)\\\"\", \"answer.synonyms\": \"\\\"['breast and lung cancer']\\\\n\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_large_A_bigbio_qa": {"config_name": "biomrc_large_A_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"Attributes of brain metastases from XXXX .\\\\n\\\"\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity1\\\", \\\"@entity0\\\", \\\"@entity5\\\", \\\"@entity4\\\", \\\"...\", \"context\": \"\\\"BACKGROUND: Most brain metastases arise from @ent...\", \"answer\": \"[\\\"@entity0\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_small_A_source": {"config_name": "biomrc_small_A_source", "sample_row": "{\"abstract\": \"\\\"Single-agent activity for @entity8253 reflected b...\", \"title\": \"\\\"No synergistic activity of @entity1259 and XXXX i...\", \"entities_list.pseudoidentifier\": \"[\\\"@entity1\\\", \\\"@entity632\\\", \\\"@entity137\\\", \\\"@entity4...\", \"entities_list.identifier\": \"[\\\"('9606', 'Species')\\\", \\\"('MESH:D004317', 'Chemica...\", \"entities_list.synonyms\": \"[\\\"['patients', 'patient']\\\", \\\"['Adriamycin']\\\", \\\"['t...\", \"answer.pseudoidentifier\": \"\\\"@entity4020\\\"\", \"answer.identifier\": \"\\\"(3440,Gene)\\\"\", \"answer.synonyms\": \"\\\"['interferon-alpha 2b']\\\\n\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_small_A_bigbio_qa": {"config_name": "biomrc_small_A_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"No synergistic activity of @entity1259 and XXXX i...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity1\\\", \\\"@entity632\\\", \\\"@entity137\\\", \\\"@entity4...\", \"context\": \"\\\"Single-agent activity for @entity8253 reflected b...\", \"answer\": \"[\\\"@entity4020\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_tiny_A_source": {"config_name": "biomrc_tiny_A_source", "sample_row": "{\"abstract\": \"\\\"BACKGROUND: Research into the optimal treatment o...\", \"title\": \"\\\"Radiographic classification and treatment of XXXX...\", \"entities_list.pseudoidentifier\": \"[\\\"@entity1\\\", \\\"@entity85\\\", \\\"@entity82\\\", \\\"@entity319...\", \"entities_list.identifier\": \"[\\\"('9606', 'Species')\\\", \\\"('MESH:D009140', 'Disease...\", \"entities_list.synonyms\": \"[\\\"['patients']\\\", \\\"['valgus deformity', 'angular de...\", \"answer.pseudoidentifier\": \"\\\"@entity82\\\"\", \"answer.identifier\": \"\\\"(MESH:D005355,Disease)\\\"\", \"answer.synonyms\": \"\\\"['fibrous dysplasia']\\\\n\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_tiny_A_bigbio_qa": {"config_name": "biomrc_tiny_A_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"Radiographic classification and treatment of XXXX...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity1\\\", \\\"@entity85\\\", \\\"@entity82\\\", \\\"@entity319...\", \"context\": \"\\\"BACKGROUND: Research into the optimal treatment o...\", \"answer\": \"[\\\"@entity82\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_large_B_source": {"config_name": "biomrc_large_B_source", "sample_row": "{\"abstract\": \"\\\"BACKGROUND: Most brain metastases arise from @ent...\", \"title\": \"\\\"Attributes of brain metastases from XXXX .\\\\n\\\"\", \"entities_list.pseudoidentifier\": \"[\\\"@entity1\\\", \\\"@entity0\\\", \\\"@entity5\\\", \\\"@entity4\\\", \\\"...\", \"entities_list.identifier\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\"]\", \"entities_list.synonyms\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\"]\", \"answer.pseudoidentifier\": \"\\\"@entity0\\\"\", \"answer.identifier\": \"\\\"\\\"\", \"answer.synonyms\": \"\\\"\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_large_B_bigbio_qa": {"config_name": "biomrc_large_B_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"Attributes of brain metastases from XXXX .\\\\n\\\"\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity1\\\", \\\"@entity0\\\", \\\"@entity5\\\", \\\"@entity4\\\", \\\"...\", \"context\": \"\\\"BACKGROUND: Most brain metastases arise from @ent...\", \"answer\": \"[\\\"@entity0\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_small_B_source": {"config_name": "biomrc_small_B_source", "sample_row": "{\"abstract\": \"\\\"Single-agent activity for @entity12 reflected by ...\", \"title\": \"\\\"No synergistic activity of @entity7 and XXXX in t...\", \"entities_list.pseudoidentifier\": \"[\\\"@entity0\\\", \\\"@entity6\\\", \\\"@entity2\\\", \\\"@entity5\\\", \\\"...\", \"entities_list.identifier\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"...\", \"entities_list.synonyms\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"...\", \"answer.pseudoidentifier\": \"\\\"@entity10\\\"\", \"answer.identifier\": \"\\\"\\\"\", \"answer.synonyms\": \"\\\"\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_small_B_bigbio_qa": {"config_name": "biomrc_small_B_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"No synergistic activity of @entity7 and XXXX in t...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity0\\\", \\\"@entity6\\\", \\\"@entity2\\\", \\\"@entity5\\\", \\\"...\", \"context\": \"\\\"Single-agent activity for @entity12 reflected by ...\", \"answer\": \"[\\\"@entity10\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_tiny_B_source": {"config_name": "biomrc_tiny_B_source", "sample_row": "{\"abstract\": \"\\\"@entity3 ( @entity2 ) has been increasingly recog...\", \"title\": \"\\\"Breast-fed @entity0 achieve a higher rate of brai...\", \"entities_list.pseudoidentifier\": \"[\\\"@entity0\\\", \\\"@entity3\\\", \\\"@entity1\\\", \\\"@entity2\\\", \\\"...\", \"entities_list.identifier\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\"]\", \"entities_list.synonyms\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\"]\", \"answer.pseudoidentifier\": \"\\\"@entity3\\\"\", \"answer.identifier\": \"\\\"\\\"\", \"answer.synonyms\": \"\\\"\\\"\"}", "columns": ["abstract", "title", "entities_list_pseudoidentifier", "entities_list_identifier", "entities_list_synonyms", "answer_pseudoidentifier", "answer_identifier", "answer_synonyms"], "columns_mapping": {"abstract": "abstract", "title": "title", "entities_list.pseudoidentifier": "entities_list_pseudoidentifier", "entities_list.identifier": "entities_list_identifier", "entities_list.synonyms": "entities_list_synonyms", "answer.pseudoidentifier": "answer_pseudoidentifier", "answer.identifier": "answer_identifier", "answer.synonyms": "answer_synonyms"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}, "biomrc_tiny_B_bigbio_qa": {"config_name": "biomrc_tiny_B_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"1\\\"\", \"document_id\": \"\\\"2\\\"\", \"question\": \"\\\"Breast-fed @entity0 achieve a higher rate of brai...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"@entity0\\\", \\\"@entity3\\\", \\\"@entity1\\\", \\\"@entity2\\\", \\\"...\", \"context\": \"\\\"@entity3 ( @entity2 ) has been increasingly recog...\", \"answer\": \"[\\\"@entity3\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "We introduce BIOMRC, a large-scale cloze-style biomedical MRC dataset. Care was taken to reduce noise, compared to the\nprevious BIOREAD dataset of Pappas et al. (2018). Experiments show that simple heuristics do not perform well on the\nnew dataset and that two neural MRC models that had been tested on BIOREAD perform much better on BIOMRC, indicating\nthat the new dataset is indeed less noisy or at least that its task is more feasible. Non-expert human performance is\nalso higher on the new dataset compared to BIOREAD, and biomedical experts perform even better. We also introduce a new\nBERT-based MRC model, the best version of which substantially outperforms all other methods tested, reaching or\nsurpassing the accuracy of biomedical experts in some experiments. We make the new dataset available in three different\nsizes, also releasing our code, and providing a leaderboard.\n", "dataset_name": "bigbio/biomrc"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bionlp_st_2011_epi": {"dataset_name": "bigbio/bionlp_st_2011_epi", "description": "The dataset of the Epigenetics and Post-translational Modifications (EPI) task\nof BioNLP Shared Task 2011.", "downloads": 61, "configs": {"bionlp_st_2011_epi_source": {"config_name": "bionlp_st_2011_epi_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10190553\\\"\", \"text\": \"\\\"Regulation of connexin32 and connexin43 gene expr...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[14, ...\", \"events\": \"[{\\\"trigger\\\": \\\"T26\\\", \\\"id\\\": \\\"E1\\\", \\\"type\\\": \\\"DNA_methy...\", \"relations\": \"[]\", \"equivalences\": \"[{\\\"id\\\": \\\"*\\\", \\\"ref_ids\\\": [\\\"T3\\\", \\\"T4\\\"]}, {\\\"id\\\": \\\"*\\\",...\", \"attributes\": \"[{\\\"id\\\": \\\"M1\\\", \\\"type\\\": \\\"Negation\\\", \\\"ref_id\\\": \\\"E3\\\", ...\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The dataset of the Epigenetics and Post-translational Modifications (EPI) task\nof BioNLP Shared Task 2011.\n", "dataset_name": "bigbio/bionlp_st_2011_epi"}, "bionlp_st_2011_epi_bigbio_kb": {"config_name": "bionlp_st_2011_epi_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10190553\\\"\", \"passages\": \"[{\\\"id\\\": \\\"PMID-10190553__text\\\", \\\"type\\\": \\\"abstract\\\",...\", \"entities\": \"[{\\\"id\\\": \\\"PMID-10190553_T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"of...\", \"events\": \"[{\\\"id\\\": \\\"PMID-10190553_E1\\\", \\\"type\\\": \\\"DNA_methylati...\", \"coreferences\": \"[{\\\"id\\\": \\\"PMID-10190553_1\\\", \\\"entity_ids\\\": [\\\"PMID-10...\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The dataset of the Epigenetics and Post-translational Modifications (EPI) task\nof BioNLP Shared Task 2011.\n", "dataset_name": "bigbio/bionlp_st_2011_epi"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bionlp_st_2011_ge": {"dataset_name": "bigbio/bionlp_st_2011_ge", "description": "The BioNLP-ST GE task has been promoting development of fine-grained information extraction (IE) from biomedical\ndocuments, since 2009. Particularly, it has focused on the domain of NFkB as a model domain of Biomedical IE.\nThe GENIA task aims at extracting events occurring upon genes or gene products, which are typed as \"Protein\"\nwithout differentiating genes from gene products. Other types of physical entities, e.g. cells, cell components,\nare not differentiated from each other, and their type is given as \"Entity\".", "downloads": 17, "configs": {"bionlp_st_2011_ge_source": {"config_name": "bionlp_st_2011_ge_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMC-1310901-00-TIAB\\\"\", \"text\": \"\\\"Down-regulation of interferon regulatory factor 4...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[19, ...\", \"events\": \"[{\\\"trigger\\\": \\\"T18\\\", \\\"id\\\": \\\"E1\\\", \\\"type\\\": \\\"Negative_...\", \"relations\": \"[]\", \"equivalences\": \"[{\\\"id\\\": \\\"*\\\", \\\"ref_ids\\\": [\\\"T4\\\", \\\"T5\\\"]}]\", \"attributes\": \"[{\\\"id\\\": \\\"M1\\\", \\\"type\\\": \\\"Negation\\\", \\\"ref_id\\\": \\\"E3\\\", ...\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The BioNLP-ST GE task has been promoting development of fine-grained information extraction (IE) from biomedical\ndocuments, since 2009. Particularly, it has focused on the domain of NFkB as a model domain of Biomedical IE.\nThe GENIA task aims at extracting events occurring upon genes or gene products, which are typed as \"Protein\"\nwithout differentiating genes from gene products. Other types of physical entities, e.g. cells, cell components,\nare not differentiated from each other, and their type is given as \"Entity\".\n", "dataset_name": "bigbio/bionlp_st_2011_ge"}, "bionlp_st_2011_ge_bigbio_kb": {"config_name": "bionlp_st_2011_ge_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMC-1310901-00-TIAB\\\"\", \"passages\": \"[{\\\"id\\\": \\\"PMC-1310901-00-TIAB__text\\\", \\\"type\\\": \\\"abst...\", \"entities\": \"[{\\\"id\\\": \\\"PMC-1310901-00-TIAB_T1\\\", \\\"type\\\": \\\"Protein...\", \"events\": \"[{\\\"id\\\": \\\"PMC-1310901-00-TIAB_E1\\\", \\\"type\\\": \\\"Negativ...\", \"coreferences\": \"[{\\\"id\\\": \\\"PMC-1310901-00-TIAB_1\\\", \\\"entity_ids\\\": [\\\"P...\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The BioNLP-ST GE task has been promoting development of fine-grained information extraction (IE) from biomedical\ndocuments, since 2009. Particularly, it has focused on the domain of NFkB as a model domain of Biomedical IE.\nThe GENIA task aims at extracting events occurring upon genes or gene products, which are typed as \"Protein\"\nwithout differentiating genes from gene products. Other types of physical entities, e.g. cells, cell components,\nare not differentiated from each other, and their type is given as \"Entity\".\n", "dataset_name": "bigbio/bionlp_st_2011_ge"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bionlp_st_2013_gro": {"dataset_name": "bigbio/bionlp_st_2013_gro", "description": "GRO Task: Populating the Gene Regulation Ontology with events and\nrelations. A data set from the bio NLP shared tasks competition from 2013", "downloads": 20, "configs": {"bionlp_st_2013_gro_source": {"config_name": "bionlp_st_2013_gro_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10025957\\\"\", \"text\": \"\\\"UCP4, a novel brain-specific mitochondrial protei...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[0, 4...\", \"events\": \"[{\\\"trigger\\\": \\\"T11\\\", \\\"id\\\": \\\"E1\\\", \\\"type\\\": \\\"CellularP...\", \"relations\": \"[{\\\"id\\\": \\\"R3\\\", \\\"type\\\": \\\"locatedIn\\\", \\\"head\\\": {\\\"role\\\"...\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "GRO Task: Populating the Gene Regulation Ontology with events and\nrelations. A data set from the bio NLP shared tasks competition from 2013\n", "dataset_name": "bigbio/bionlp_st_2013_gro"}, "bionlp_st_2013_gro_bigbio_kb": {"config_name": "bionlp_st_2013_gro_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10025957\\\"\", \"passages\": \"[{\\\"id\\\": \\\"PMID-10025957__text\\\", \\\"type\\\": \\\"abstract\\\",...\", \"entities\": \"[{\\\"id\\\": \\\"PMID-10025957_T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"of...\", \"events\": \"[{\\\"id\\\": \\\"PMID-10025957_E1\\\", \\\"type\\\": \\\"CellularProce...\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"arg1_id\\\": \\\"PMID-10025957_T4\\\", \\\"arg2_id\\\": \\\"PMID-...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "GRO Task: Populating the Gene Regulation Ontology with events and\nrelations. A data set from the bio NLP shared tasks competition from 2013\n", "dataset_name": "bigbio/bionlp_st_2013_gro"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bionlp_st_2013_pc": {"dataset_name": "bigbio/bionlp_st_2013_pc", "description": "the Pathway Curation (PC) task is a main event extraction task of the BioNLP shared task (ST) 2013.\nThe PC task concerns the automatic extraction of biomolecular reactions from text.\nThe task setting, representation and semantics are defined with respect to pathway\nmodel standards and ontologies (SBML, BioPAX, SBO) and documents selected by relevance\nto specific model reactions. Two BioNLP ST 2013 participants successfully completed\nthe PC task. The highest achieved F-score, 52.8%, indicates that event extraction is\na promising approach to supporting pathway curation efforts.", "downloads": 59, "configs": {"bionlp_st_2013_pc_source": {"config_name": "bionlp_st_2013_pc_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10085159\\\"\", \"text\": \"\\\"The Cdc6 protein is ubiquitinated in vivo for pro...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T1\\\", \\\"type\\\": \\\"Gene_or_gene_product\\\", \\\"off...\", \"events\": \"[{\\\"trigger\\\": \\\"T15\\\", \\\"id\\\": \\\"E1\\\", \\\"type\\\": \\\"Ubiquitin...\", \"relations\": \"[]\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "the Pathway Curation (PC) task is a main event extraction task of the BioNLP shared task (ST) 2013.\nThe PC task concerns the automatic extraction of biomolecular reactions from text.\nThe task setting, representation and semantics are defined with respect to pathway\nmodel standards and ontologies (SBML, BioPAX, SBO) and documents selected by relevance\nto specific model reactions. Two BioNLP ST 2013 participants successfully completed\nthe PC task. The highest achieved F-score, 52.8%, indicates that event extraction is\na promising approach to supporting pathway curation efforts.\n", "dataset_name": "bigbio/bionlp_st_2013_pc"}, "bionlp_st_2013_pc_bigbio_kb": {"config_name": "bionlp_st_2013_pc_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"PMID-10085159\\\"\", \"passages\": \"[{\\\"id\\\": \\\"PMID-10085159__text\\\", \\\"type\\\": \\\"abstract\\\",...\", \"entities\": \"[{\\\"id\\\": \\\"PMID-10085159_T1\\\", \\\"type\\\": \\\"Gene_or_gene_...\", \"events\": \"[{\\\"id\\\": \\\"PMID-10085159_E1\\\", \\\"type\\\": \\\"Ubiquitinatio...\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "the Pathway Curation (PC) task is a main event extraction task of the BioNLP shared task (ST) 2013.\nThe PC task concerns the automatic extraction of biomolecular reactions from text.\nThe task setting, representation and semantics are defined with respect to pathway\nmodel standards and ontologies (SBML, BioPAX, SBO) and documents selected by relevance\nto specific model reactions. Two BioNLP ST 2013 participants successfully completed\nthe PC task. The highest achieved F-score, 52.8%, indicates that event extraction is\na promising approach to supporting pathway curation efforts.\n", "dataset_name": "bigbio/bionlp_st_2013_pc"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/bionlp_st_2019_bb": {"dataset_name": "bigbio/bionlp_st_2019_bb", "description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.", "downloads": 107, "configs": {"bionlp_st_2019_bb_norm_source": {"config_name": "bionlp_st_2019_bb_norm_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-norm-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[]\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[{\\\"id\\\": \\\"N1\\\", \\\"ref_id\\\": \\\"T3\\\", \\\"resource_name\\\": \\\"On...\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_norm+ner_source": {"config_name": "bionlp_st_2019_bb_norm+ner_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-norm+ner-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[]\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[{\\\"id\\\": \\\"N1\\\", \\\"ref_id\\\": \\\"T3\\\", \\\"resource_name\\\": \\\"On...\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_rel_source": {"config_name": "bionlp_st_2019_bb_rel_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-rel-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Lives_In\\\", \\\"head\\\": {\\\"role\\\":...\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_rel+ner_source": {"config_name": "bionlp_st_2019_bb_rel+ner_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-rel+ner-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Lives_In\\\", \\\"head\\\": {\\\"role\\\":...\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[]\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_kb_source": {"config_name": "bionlp_st_2019_bb_kb_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-kb-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Lives_In\\\", \\\"head\\\": {\\\"role\\\":...\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[{\\\"id\\\": \\\"N1\\\", \\\"ref_id\\\": \\\"T3\\\", \\\"resource_name\\\": \\\"On...\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_kb+ner_source": {"config_name": "bionlp_st_2019_bb_kb+ner_source", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-kb+ner-1016123\\\"\", \"text\": \"\\\"An evaluation of selective broths based on the bi...\", \"text_bound_annotations\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Habitat\\\", \\\"offsets\\\": [[17, ...\", \"events\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Lives_In\\\", \\\"head\\\": {\\\"role\\\":...\", \"equivalences\": \"[]\", \"attributes\": \"[]\", \"normalizations\": \"[{\\\"id\\\": \\\"N1\\\", \\\"ref_id\\\": \\\"T3\\\", \\\"resource_name\\\": \\\"On...\"}", "columns": ["id", "document_id", "text", "text_bound_annotations", "events", "relations", "equivalences", "attributes", "normalizations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_bound_annotations": "text_bound_annotations", "events": "events", "relations": "relations", "equivalences": "equivalences", "attributes": "attributes", "normalizations": "normalizations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}, "bionlp_st_2019_bb_bigbio_kb": {"config_name": "bionlp_st_2019_bb_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"BB-kb+ner-1016123\\\"\", \"passages\": \"[{\\\"id\\\": \\\"BB-kb+ner-1016123__text\\\", \\\"type\\\": \\\"abstra...\", \"entities\": \"[{\\\"id\\\": \\\"BB-kb+ner-1016123_T3\\\", \\\"type\\\": \\\"Habitat\\\",...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"arg1_id\\\": \\\"BB-kb+ner-1016123_T5\\\", \\\"arg2_id\\\": \\\"B...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The task focuses on the extraction of the locations and phenotypes of\nmicroorganisms from PubMed abstracts and full-text excerpts, and the\ncharacterization of these entities with respect to reference knowledge\nsources (NCBI taxonomy, OntoBiotope ontology). The task is motivated by\nthe importance of the knowledge on biodiversity for fundamental research\nand applications in microbiology.\n\n", "dataset_name": "bigbio/bionlp_st_2019_bb"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/biorelex": {"dataset_name": "bigbio/biorelex", "description": "BioRelEx is a biological relation extraction dataset. Version 1.0 contains 2010\nannotated sentences that describe binding interactions between various\nbiological entities (proteins, chemicals, etc.). 1405 sentences are for\ntraining, another 201 sentences are for validation. They are publicly available\nat https://github.com/YerevaNN/BioRelEx/releases. Another 404 sentences are for\ntesting which are kept private for at this Codalab competition\nhttps://competitions.codalab.org/competitions/20468. All sentences contain words\n\"bind\", \"bound\" or \"binding\". For every sentence we provide: 1) Complete\nannotations of all biological entities that appear in the sentence 2) Entity\ntypes (32 types) and grounding information for most of the proteins and families\n(links to uniprot, interpro and other databases) 3) Coreference between entities\nin the same sentence (e.g. abbreviations and synonyms) 4) Binding interactions\nbetween the annotated entities 5) Binding interaction types: positive, negative\n(A does not bind B) and neutral (A may bind to B)", "downloads": 68, "configs": {"biorelex_source": {"config_name": "biorelex_source", "sample_row": "{\"paperid\": \"\\\"24813911\\\"\", \"interactions\": \"[{\\\"participants\\\": [0, 2], \\\"type\\\": \\\"bind\\\", \\\"implici...\", \"url\": \"\\\"http://molpharm.aspetjournals.org/content/53/6/10...\", \"text\": \"\\\"Moreover, the in vitro binding of NF-\\\\u03baB or S...\", \"entities\": \"[{\\\"is_state\\\": false, \\\"label\\\": \\\"DNA\\\", \\\"names\\\": [{\\\"i...\", \"_line_\": \"7\", \"id\": \"\\\"1.0alpha7.train.0\\\"\"}", "columns": ["paperid", "interactions", "url", "text", "entities", "_line_", "id"], "columns_mapping": {"paperid": "paperid", "interactions": "interactions", "url": "url", "text": "text", "entities": "entities", "_line_": "_line_", "id": "id"}, "dataset_description": "BioRelEx is a biological relation extraction dataset. Version 1.0 contains 2010\nannotated sentences that describe binding interactions between various\nbiological entities (proteins, chemicals, etc.). 1405 sentences are for\ntraining, another 201 sentences are for validation. They are publicly available\nat https://github.com/YerevaNN/BioRelEx/releases. Another 404 sentences are for\ntesting which are kept private for at this Codalab competition\nhttps://competitions.codalab.org/competitions/20468. All sentences contain words\n\"bind\", \"bound\" or \"binding\". For every sentence we provide: 1) Complete\nannotations of all biological entities that appear in the sentence 2) Entity\ntypes (32 types) and grounding information for most of the proteins and families\n(links to uniprot, interpro and other databases) 3) Coreference between entities\nin the same sentence (e.g. abbreviations and synonyms) 4) Binding interactions\nbetween the annotated entities 5) Binding interaction types: positive, negative\n(A does not bind B) and neutral (A may bind to B)", "dataset_name": "bigbio/biorelex"}, "biorelex_bigbio_kb": {"config_name": "biorelex_bigbio_kb", "sample_row": "{\"id\": \"\\\"1.0alpha7.train.0\\\"\", \"document_id\": \"\\\"24813911\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1.0alpha7.train.0.sent\\\", \\\"type\\\": \\\"sentenc...\", \"entities\": \"[{\\\"id\\\": \\\"1.0alpha7.train.0.ent0_0\\\", \\\"type\\\": \\\"DNA\\\",...\", \"events\": \"[]\", \"coreferences\": \"[{\\\"id\\\": \\\"1.0alpha7.train.0.coref0\\\", \\\"entity_ids\\\": ...\", \"relations\": \"[{\\\"id\\\": \\\"1.0alpha7.train.0.rel0s1.0alpha7.train.0....\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "BioRelEx is a biological relation extraction dataset. Version 1.0 contains 2010\nannotated sentences that describe binding interactions between various\nbiological entities (proteins, chemicals, etc.). 1405 sentences are for\ntraining, another 201 sentences are for validation. They are publicly available\nat https://github.com/YerevaNN/BioRelEx/releases. Another 404 sentences are for\ntesting which are kept private for at this Codalab competition\nhttps://competitions.codalab.org/competitions/20468. All sentences contain words\n\"bind\", \"bound\" or \"binding\". For every sentence we provide: 1) Complete\nannotations of all biological entities that appear in the sentence 2) Entity\ntypes (32 types) and grounding information for most of the proteins and families\n(links to uniprot, interpro and other databases) 3) Coreference between entities\nin the same sentence (e.g. abbreviations and synonyms) 4) Binding interactions\nbetween the annotated entities 5) Binding interaction types: positive, negative\n(A does not bind B) and neutral (A may bind to B)", "dataset_name": "bigbio/biorelex"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/chebi_nactem": {"dataset_name": "bigbio/chebi_nactem", "description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.", "downloads": 120, "configs": {"chebi_nactem_abstr_ann1_source": {"config_name": "chebi_nactem_abstr_ann1_source", "sample_row": "{\"document_id\": \"\\\"10026165\\\"\", \"text\": \"\\\"3,4-Dihydroxyphenylalanine (Dopa) decarboxylase i...\", \"entities\": \"[{\\\"id\\\": \\\"T4\\\", \\\"type\\\": \\\"Protein\\\", \\\"text\\\": \\\"['3,4-Di...\", \"relations\": \"[{\\\"id\\\": \\\"R4\\\", \\\"type\\\": \\\"Associated_With\\\", \\\"arg1\\\": \\\"...\"}", "columns": ["document_id", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}, "chebi_nactem_abstr_ann1_bigbio_kb": {"config_name": "chebi_nactem_abstr_ann1_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"10026165\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"\\\", \\\"text\\\": [\\\"3,4-Dihydroxyph...\", \"entities\": \"[{\\\"id\\\": \\\"1_T4\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[0,...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"1_R4\\\", \\\"type\\\": \\\"Associated_With\\\", \\\"arg1_i...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}, "chebi_nactem_abstr_ann2_source": {"config_name": "chebi_nactem_abstr_ann2_source", "sample_row": "{\"document_id\": \"\\\"10026165\\\"\", \"text\": \"\\\"3,4-Dihydroxyphenylalanine (Dopa) decarboxylase i...\", \"entities\": \"[{\\\"id\\\": \\\"T3\\\", \\\"type\\\": \\\"Protein\\\", \\\"text\\\": \\\"['3,4-Di...\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Associated_With\\\", \\\"arg1\\\": \\\"...\"}", "columns": ["document_id", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}, "chebi_nactem_abstr_ann2_bigbio_kb": {"config_name": "chebi_nactem_abstr_ann2_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"10026165\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"\\\", \\\"text\\\": [\\\"3,4-Dihydroxyph...\", \"entities\": \"[{\\\"id\\\": \\\"1_T3\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[0,...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"1_R1\\\", \\\"type\\\": \\\"Associated_With\\\", \\\"arg1_i...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}, "chebi_nactem_fullpaper_source": {"config_name": "chebi_nactem_fullpaper_source", "sample_row": "{\"document_id\": \"\\\"10023770\\\"\", \"text\": \"\\\" The dogma of exclusive T cell recognition of pep...\", \"entities\": \"[{\\\"id\\\": \\\"T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"text\\\": \\\"['major ...\", \"relations\": \"[{\\\"id\\\": \\\"R1\\\", \\\"type\\\": \\\"Binds_With\\\", \\\"arg1\\\": \\\"T156\\\"...\"}", "columns": ["document_id", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}, "chebi_nactem_fullpaper_bigbio_kb": {"config_name": "chebi_nactem_fullpaper_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"10023770\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"\\\", \\\"text\\\": [\\\" The dogma of e...\", \"entities\": \"[{\\\"id\\\": \\\"1_T1\\\", \\\"type\\\": \\\"Protein\\\", \\\"offsets\\\": [[54...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"1_R1\\\", \\\"type\\\": \\\"Binds_With\\\", \\\"arg1_id\\\": \\\"...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The ChEBI corpus contains 199 annotated abstracts and 100 annotated full papers.\nAll documents in the corpus have been annotated for named entities and relations\nbetween these. In total, our corpus provides over 15000 named entity annotations\nand over 6,000 relations between entities.\n", "dataset_name": "bigbio/chebi_nactem"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/chemprot": {"dataset_name": "bigbio/chemprot", "description": "The BioCreative VI Chemical-Protein interaction dataset identifies entities of\nchemicals and proteins and their likely relation to one other. Compounds are\ngenerally agonists (activators) or antagonists (inhibitors) of proteins.", "downloads": 454, "configs": {"chemprot_full_source": {"config_name": "chemprot_full_source", "sample_row": "{\"pmid\": \"\\\"16357751\\\"\", \"text\": \"\\\"Selective costimulation modulators: a novel appro...\", \"entities.id\": \"[\\\"T1\\\", \\\"T2\\\", \\\"T3\\\", \\\"T4\\\", \\\"T5\\\"]\", \"entities.type\": \"[\\\"CHEMICAL\\\", \\\"GENE-N\\\", \\\"GENE-Y\\\", \\\"GENE-Y\\\", \\\"GENE-N...\", \"entities.text\": \"[\\\"methotrexate\\\", \\\"tumor necrosis factor\\\", \\\"CD80\\\", ...\", \"entities.offsets\": \"[[1342, 1354], [1364, 1385], [805, 809], [810, 814...\", \"relations.type\": \"[]\", \"relations.arg1\": \"[]\", \"relations.arg2\": \"[]\"}", "columns": ["pmid", "text", "entities_id", "entities_type", "entities_text", "entities_offsets", "relations_type", "relations_arg1", "relations_arg2"], "columns_mapping": {"pmid": "pmid", "text": "text", "entities.id": "entities_id", "entities.type": "entities_type", "entities.text": "entities_text", "entities.offsets": "entities_offsets", "relations.type": "relations_type", "relations.arg1": "relations_arg1", "relations.arg2": "relations_arg2"}, "dataset_description": "The BioCreative VI Chemical-Protein interaction dataset identifies entities of\nchemicals and proteins and their likely relation to one other. Compounds are\ngenerally agonists (activators) or antagonists (inhibitors) of proteins.\n", "dataset_name": "bigbio/chemprot"}, "chemprot_shared_task_eval_source": {"config_name": "chemprot_shared_task_eval_source", "sample_row": "{\"pmid\": \"\\\"16357751\\\"\", \"text\": \"\\\"Selective costimulation modulators: a novel appro...\", \"entities.id\": \"[\\\"T1\\\", \\\"T2\\\", \\\"T3\\\", \\\"T4\\\", \\\"T5\\\"]\", \"entities.type\": \"[\\\"CHEMICAL\\\", \\\"GENE-N\\\", \\\"GENE-Y\\\", \\\"GENE-Y\\\", \\\"GENE-N...\", \"entities.text\": \"[\\\"methotrexate\\\", \\\"tumor necrosis factor\\\", \\\"CD80\\\", ...\", \"entities.offsets\": \"[[1342, 1354], [1364, 1385], [805, 809], [810, 814...\", \"relations.type\": \"[]\", \"relations.arg1\": \"[]\", \"relations.arg2\": \"[]\"}", "columns": ["pmid", "text", "entities_id", "entities_type", "entities_text", "entities_offsets", "relations_type", "relations_arg1", "relations_arg2"], "columns_mapping": {"pmid": "pmid", "text": "text", "entities.id": "entities_id", "entities.type": "entities_type", "entities.text": "entities_text", "entities.offsets": "entities_offsets", "relations.type": "relations_type", "relations.arg1": "relations_arg1", "relations.arg2": "relations_arg2"}, "dataset_description": "The BioCreative VI Chemical-Protein interaction dataset identifies entities of\nchemicals and proteins and their likely relation to one other. Compounds are\ngenerally agonists (activators) or antagonists (inhibitors) of proteins.\n", "dataset_name": "bigbio/chemprot"}, "chemprot_bigbio_kb": {"config_name": "chemprot_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"16357751\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"title and abstract\\\", \\\"text\\\":...\", \"entities\": \"[{\\\"offsets\\\": [[1342, 1354]], \\\"text\\\": [\\\"methotrexat...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The BioCreative VI Chemical-Protein interaction dataset identifies entities of\nchemicals and proteins and their likely relation to one other. Compounds are\ngenerally agonists (activators) or antagonists (inhibitors) of proteins.\n", "dataset_name": "bigbio/chemprot"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/chia": {"dataset_name": "bigbio/chia", "description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.", "downloads": 91, "configs": {"chia_source": {"config_name": "chia_source", "sample_row": "{\"id\": \"\\\"NCT00050349_exc\\\"\", \"document_id\": \"\\\"NCT00050349\\\"\", \"text\": \"\\\"Patients with symptomatic CNS metastases or lepto...\", \"text_type\": \"\\\"exclusion\\\"\", \"entities\": \"[{\\\"id\\\": \\\"NCT00050349_exc_T1\\\", \\\"text\\\": [\\\"CNS metast...\", \"relations\": \"[{\\\"id\\\": \\\"NCT00050349_exc_R1\\\", \\\"type\\\": \\\"Has_tempora...\"}", "columns": ["id", "document_id", "text", "text_type", "entities", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_type": "text_type", "entities": "entities", "relations": "relations"}, "dataset_description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.\n", "dataset_name": "bigbio/chia"}, "chia_fixed_source": {"config_name": "chia_fixed_source", "sample_row": "{\"id\": \"\\\"NCT00050349_exc\\\"\", \"document_id\": \"\\\"NCT00050349\\\"\", \"text\": \"\\\"Patients with symptomatic CNS metastases or lepto...\", \"text_type\": \"\\\"exclusion\\\"\", \"entities\": \"[{\\\"id\\\": \\\"NCT00050349_exc_T1\\\", \\\"text\\\": [\\\"CNS metast...\", \"relations\": \"[{\\\"id\\\": \\\"NCT00050349_exc_R1\\\", \\\"type\\\": \\\"Has_tempora...\"}", "columns": ["id", "document_id", "text", "text_type", "entities", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_type": "text_type", "entities": "entities", "relations": "relations"}, "dataset_description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.\n", "dataset_name": "bigbio/chia"}, "chia_without_scope_source": {"config_name": "chia_without_scope_source", "sample_row": "{\"id\": \"\\\"NCT00050349_exc\\\"\", \"document_id\": \"\\\"NCT00050349\\\"\", \"text\": \"\\\"Patients with symptomatic CNS metastases or lepto...\", \"text_type\": \"\\\"exclusion\\\"\", \"entities\": \"[{\\\"id\\\": \\\"NCT00050349_exc_T1\\\", \\\"text\\\": [\\\"CNS metast...\", \"relations\": \"[{\\\"id\\\": \\\"NCT00050349_exc_R1\\\", \\\"type\\\": \\\"Has_tempora...\"}", "columns": ["id", "document_id", "text", "text_type", "entities", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_type": "text_type", "entities": "entities", "relations": "relations"}, "dataset_description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.\n", "dataset_name": "bigbio/chia"}, "chia_without_scope_fixed_source": {"config_name": "chia_without_scope_fixed_source", "sample_row": "{\"id\": \"\\\"NCT00050349_exc\\\"\", \"document_id\": \"\\\"NCT00050349\\\"\", \"text\": \"\\\"Patients with symptomatic CNS metastases or lepto...\", \"text_type\": \"\\\"exclusion\\\"\", \"entities\": \"[{\\\"id\\\": \\\"NCT00050349_exc_T1\\\", \\\"text\\\": [\\\"CNS metast...\", \"relations\": \"[{\\\"id\\\": \\\"NCT00050349_exc_R1\\\", \\\"type\\\": \\\"Has_tempora...\"}", "columns": ["id", "document_id", "text", "text_type", "entities", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "text_type": "text_type", "entities": "entities", "relations": "relations"}, "dataset_description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.\n", "dataset_name": "bigbio/chia"}, "chia_bigbio_kb": {"config_name": "chia_bigbio_kb", "sample_row": "{\"id\": \"\\\"NCT00050349_exc\\\"\", \"document_id\": \"\\\"NCT00050349\\\"\", \"passages\": \"[{\\\"id\\\": \\\"NCT00050349_exc_text\\\", \\\"type\\\": \\\"exclusion...\", \"entities\": \"[{\\\"id\\\": \\\"NCT00050349_exc_T1\\\", \\\"text\\\": [\\\"CNS metast...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"NCT00050349_exc_R1\\\", \\\"type\\\": \\\"Has_tempora...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "A large annotated corpus of patient eligibility criteria extracted from 1,000\ninterventional, Phase IV clinical trials registered in ClinicalTrials.gov. This\ndataset includes 12,409 annotated eligibility criteria, represented by 41,487\ndistinctive entities of 15 entity types and 25,017 relationships of 12\nrelationship types.\n", "dataset_name": "bigbio/chia"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/ddi_corpus": {"dataset_name": "bigbio/ddi_corpus", "description": "The DDI corpus has been manually annotated with drugs and pharmacokinetics and pharmacodynamics interactions. It contains 1025 documents from two different sources: DrugBank database and MedLine.", "downloads": 357, "configs": {"ddi_corpus_source": {"config_name": "ddi_corpus_source", "sample_row": "{\"document_id\": \"\\\"19-norandrostenedione_ddi\\\"\", \"text\": \"\\\"No drug, nutritional supplement, food or herb int...\", \"entities\": \"[]\", \"relations\": \"[]\"}", "columns": ["document_id", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "The DDI corpus has been manually annotated with drugs and pharmacokinetics and pharmacodynamics interactions. It contains 1025 documents from two different sources: DrugBank database and MedLine.\n", "dataset_name": "bigbio/ddi_corpus"}, "ddi_corpus_bigbio_kb": {"config_name": "ddi_corpus_bigbio_kb", "sample_row": "{\"id\": \"\\\"19-norandrostenedione_ddi\\\"\", \"document_id\": \"\\\"19-norandrostenedione_ddi\\\"\", \"passages\": \"[{\\\"id\\\": \\\"19-norandrostenedione_ddi__text\\\", \\\"type\\\":...\", \"entities\": \"[]\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The DDI corpus has been manually annotated with drugs and pharmacokinetics and pharmacodynamics interactions. It contains 1025 documents from two different sources: DrugBank database and MedLine.\n", "dataset_name": "bigbio/ddi_corpus"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/genetag": {"dataset_name": "bigbio/genetag", "description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..", "downloads": 25, "configs": {"genetaggold_source": {"config_name": "genetaggold_source", "sample_row": "{\"doc_id\": \"\\\"@@95229799480\\\"\", \"text\": \"\\\"Cervicovaginal foetal fibronectin in the predicti...\", \"tokenized_text\": \"[\\\"Cervicovaginal\\\", \\\"foetal\\\", \\\"fibronectin\\\", \\\"in\\\", ...\", \"pos_tags\": \"[\\\"JJ\\\", \\\"NEWGENE\\\", \\\"NEWGENE\\\", \\\"IN\\\", \\\"DT\\\", \\\"NN\\\", \\\"IN...\", \"entities\": \"[{\\\"text\\\": \\\"foetal fibronectin\\\", \\\"type\\\": \\\"NEWGENE\\\",...\"}", "columns": ["doc_id", "text", "tokenized_text", "pos_tags", "entities"], "columns_mapping": {"doc_id": "doc_id", "text": "text", "tokenized_text": "tokenized_text", "pos_tags": "pos_tags", "entities": "entities"}, "dataset_description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..\n", "dataset_name": "bigbio/genetag"}, "genetaggold_bigbio_kb": {"config_name": "genetaggold_bigbio_kb", "sample_row": "{\"id\": \"\\\"@@95229799480\\\"\", \"document_id\": \"\\\"@@95229799480\\\"\", \"passages\": \"[{\\\"id\\\": \\\"@@95229799480_text\\\", \\\"type\\\": \\\"sentence\\\", ...\", \"entities\": \"[{\\\"offsets\\\": [[15, 33]], \\\"text\\\": [\\\"foetal fibronec...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..\n", "dataset_name": "bigbio/genetag"}, "genetagcorrect_source": {"config_name": "genetagcorrect_source", "sample_row": "{\"doc_id\": \"\\\"@@95229799480\\\"\", \"text\": \"\\\"Cervicovaginal foetal fibronectin in the predicti...\", \"tokenized_text\": \"[\\\"Cervicovaginal\\\", \\\"foetal\\\", \\\"fibronectin\\\", \\\"in\\\", ...\", \"pos_tags\": \"[\\\"JJ\\\", \\\"NEWGENE\\\", \\\"NEWGENE\\\", \\\"IN\\\", \\\"DT\\\", \\\"NN\\\", \\\"IN...\", \"entities\": \"[{\\\"text\\\": \\\"fibronectin\\\", \\\"type\\\": \\\"NEWGENE\\\", \\\"token...\"}", "columns": ["doc_id", "text", "tokenized_text", "pos_tags", "entities"], "columns_mapping": {"doc_id": "doc_id", "text": "text", "tokenized_text": "tokenized_text", "pos_tags": "pos_tags", "entities": "entities"}, "dataset_description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..\n", "dataset_name": "bigbio/genetag"}, "genetagcorrect_bigbio_kb": {"config_name": "genetagcorrect_bigbio_kb", "sample_row": "{\"id\": \"\\\"@@95229799480\\\"\", \"document_id\": \"\\\"@@95229799480\\\"\", \"passages\": \"[{\\\"id\\\": \\\"@@95229799480_text\\\", \\\"type\\\": \\\"sentence\\\", ...\", \"entities\": \"[{\\\"offsets\\\": [[22, 33]], \\\"text\\\": [\\\"fibronectin\\\"], ...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "Named entity recognition (NER) is an important first step for text mining the biomedical literature.\nEvaluating the performance of biomedical NER systems is impossible without a standardized test corpus.\nThe annotation of such a corpus for gene/protein name NER is a difficult process due to the complexity\nof gene/protein names. We describe the construction and annotation of GENETAG, a corpus of 20K MEDLINE\u00ae\nsentences for gene/protein NER. 15K GENETAG sentences were used for the BioCreAtIvE Task 1A Competition..\n", "dataset_name": "bigbio/genetag"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/genia_term_corpus": {"dataset_name": "bigbio/genia_term_corpus", "description": "The identification of linguistic expressions referring to entities of interest in molecular biology such as proteins,\ngenes and cells is a fundamental task in biomolecular text mining. The GENIA technical term annotation covers the\nidentification of physical biological entities as well as other important terms. The corpus annotation covers the full\n1,999 abstracts of the primary GENIA corpus.", "downloads": 65, "configs": {"genia_term_corpus_source": {"config_name": "genia_term_corpus_source", "sample_row": "{\"document_id\": \"\\\"95369245\\\"\", \"title\": \"[{\\\"text\\\": \\\"IL-2 gene expression and NF-kappa B act...\", \"abstract\": \"[{\\\"text\\\": \\\"Activation of the CD28 surface receptor...\"}", "columns": ["document_id", "title", "abstract"], "columns_mapping": {"document_id": "document_id", "title": "title", "abstract": "abstract"}, "dataset_description": "The identification of linguistic expressions referring to entities of interest in molecular biology such as proteins,\ngenes and cells is a fundamental task in biomolecular text mining. The GENIA technical term annotation covers the\nidentification of physical biological entities as well as other important terms. The corpus annotation covers the full\n1,999 abstracts of the primary GENIA corpus.\n", "dataset_name": "bigbio/genia_term_corpus"}, "genia_term_corpus_bigbio_kb": {"config_name": "genia_term_corpus_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"95369245\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"IL-2 gene ...\", \"entities\": \"[{\\\"id\\\": \\\"3\\\", \\\"type\\\": \\\"other_name\\\", \\\"text\\\": [\\\"IL-2 ...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The identification of linguistic expressions referring to entities of interest in molecular biology such as proteins,\ngenes and cells is a fundamental task in biomolecular text mining. The GENIA technical term annotation covers the\nidentification of physical biological entities as well as other important terms. The corpus annotation covers the full\n1,999 abstracts of the primary GENIA corpus.\n", "dataset_name": "bigbio/genia_term_corpus"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/hprd50": {"dataset_name": "bigbio/hprd50", "description": "HPRD50 is a dataset of randomly selected, hand-annotated abstracts of biomedical papers\nreferenced by the Human Protein Reference Database (HPRD). It is parsed in XML format,\nsplitting each abstract into sentences, and in each sentence there may be entities and\ninteractions between those entities. In this particular dataset, entities are all\nproteins and interactions are thus protein-protein interactions.\n\nMoreover, all entities are normalized to the HPRD database. These normalized terms are\nstored in each entity's 'type' attribute in the source XML. This means the dataset can\ndetermine e.g. that \"Janus kinase 2\" and \"Jak2\" are referencing the same normalized\nentity.\n\nBecause the dataset contains entities and relations, it is suitable for Named Entity\nRecognition and Relation Extraction.", "downloads": 71, "configs": {"hprd50_source": {"config_name": "hprd50_source", "sample_row": "{\"id\": \"\\\"HPRD50.d0\\\"\", \"origId\": \"\\\"10373544\\\"\", \"set\": \"null\", \"sentences\": \"[{\\\"id\\\": \\\"HPRD50.d0.s0\\\", \\\"origId\\\": \\\"10373544.1.1\\\", ...\"}", "columns": ["id", "origId", "set", "sentences"], "columns_mapping": {"id": "id", "origId": "origId", "set": "set", "sentences": "sentences"}, "dataset_description": "HPRD50 is a dataset of randomly selected, hand-annotated abstracts of biomedical papers\nreferenced by the Human Protein Reference Database (HPRD). It is parsed in XML format,\nsplitting each abstract into sentences, and in each sentence there may be entities and\ninteractions between those entities. In this particular dataset, entities are all\nproteins and interactions are thus protein-protein interactions.\n\nMoreover, all entities are normalized to the HPRD database. These normalized terms are\nstored in each entity's 'type' attribute in the source XML. This means the dataset can\ndetermine e.g. that \"Janus kinase 2\" and \"Jak2\" are referencing the same normalized\nentity.\n\nBecause the dataset contains entities and relations, it is suitable for Named Entity\nRecognition and Relation Extraction.\n", "dataset_name": "bigbio/hprd50"}, "hprd50_bigbio_kb": {"config_name": "hprd50_bigbio_kb", "sample_row": "{\"id\": \"\\\"HPRD50.d0\\\"\", \"document_id\": \"\\\"10373544\\\"\", \"passages\": \"[{\\\"id\\\": \\\"HPRD50.d0.s0\\\", \\\"type\\\": \\\"sentence\\\", \\\"text\\\"...\", \"entities\": \"[{\\\"id\\\": \\\"HPRD50.d0.s0.e0\\\", \\\"text\\\": [\\\"TFIIIC102\\\"], ...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "HPRD50 is a dataset of randomly selected, hand-annotated abstracts of biomedical papers\nreferenced by the Human Protein Reference Database (HPRD). It is parsed in XML format,\nsplitting each abstract into sentences, and in each sentence there may be entities and\ninteractions between those entities. In this particular dataset, entities are all\nproteins and interactions are thus protein-protein interactions.\n\nMoreover, all entities are normalized to the HPRD database. These normalized terms are\nstored in each entity's 'type' attribute in the source XML. This means the dataset can\ndetermine e.g. that \"Janus kinase 2\" and \"Jak2\" are referencing the same normalized\nentity.\n\nBecause the dataset contains entities and relations, it is suitable for Named Entity\nRecognition and Relation Extraction.\n", "dataset_name": "bigbio/hprd50"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/iepa": {"dataset_name": "bigbio/iepa", "description": "The IEPA benchmark PPI corpus is designed for relation extraction. It was created from 303 PubMed abstracts, each of which contains a specific pair of co-occurring chemicals.", "downloads": 17, "configs": {"iepa_source": {"config_name": "iepa_source", "sample_row": "{\"id\": \"\\\"IEPA.d0\\\"\", \"PMID\": \"\\\"1645753\\\"\", \"origID\": \"\\\"258\\\"\", \"sentences\": \"[{\\\"id\\\": \\\"IEPA.d0.s0\\\", \\\"origID\\\": \\\"420\\\", \\\"offsets\\\": ...\"}", "columns": ["id", "PMID", "origID", "sentences"], "columns_mapping": {"id": "id", "PMID": "PMID", "origID": "origID", "sentences": "sentences"}, "dataset_description": "The IEPA benchmark PPI corpus is designed for relation extraction. It was created from 303 PubMed abstracts, each of which contains a specific pair of co-occurring chemicals.\n", "dataset_name": "bigbio/iepa"}, "iepa_bigbio_kb": {"config_name": "iepa_bigbio_kb", "sample_row": "{\"id\": \"\\\"IEPA.d0\\\"\", \"document_id\": \"\\\"1645753\\\"\", \"passages\": \"[{\\\"id\\\": \\\"IEPA.d0.s0\\\", \\\"type\\\": \\\"\\\", \\\"text\\\": [\\\"Oxytoc...\", \"entities\": \"[{\\\"id\\\": \\\"IEPA.d0.s0.e0\\\", \\\"text\\\": [\\\"Oxytocin\\\"], \\\"of...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"IEPA.d0.s0.i0\\\", \\\"type\\\": \\\"PPI\\\", \\\"arg1_id\\\":...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The IEPA benchmark PPI corpus is designed for relation extraction. It was created from 303 PubMed abstracts, each of which contains a specific pair of co-occurring chemicals.\n", "dataset_name": "bigbio/iepa"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/lll": {"dataset_name": "bigbio/lll", "description": "The LLL05 challenge task is to learn rules to extract protein/gene interactions from biology abstracts from the Medline\nbibliography database. The goal of the challenge is to test the ability of the participating IE systems to identify the\ninteractions and the gene/proteins that interact. The participants will test their IE patterns on a test set with the\naim of extracting the correct agent and target.The challenge focuses on information extraction of gene interactions in\nBacillus subtilis. Extracting gene interaction is the most popular event IE task in biology. Bacillus subtilis (Bs) is\na model bacterium and many papers have been published on direct gene interactions involved in sporulation. The gene\ninteractions are generally mentioned in the abstract and the full text of the paper is not needed. Extracting gene\ninteraction means, extracting the agent (proteins) and the target (genes) of all couples of genic interactions from\nsentences.", "downloads": 11, "configs": {"lll_source": {"config_name": "lll_source", "sample_row": "{\"id\": \"\\\"11069677-3\\\"\", \"sentence\": \"\\\"In vivo studies of the activity of four of the ki...\", \"words\": \"[{\\\"id\\\": \\\"0\\\", \\\"text\\\": \\\"In\\\", \\\"offsets\\\": [0, 2]}, {\\\"i...\", \"genic_interactions\": \"[{\\\"ref_id1\\\": \\\"29\\\", \\\"ref_id2\\\": \\\"35\\\"}, {\\\"ref_id1\\\": \\\"...\", \"agents\": \"[{\\\"ref_id\\\": \\\"29\\\"}, {\\\"ref_id\\\": \\\"31\\\"}]\", \"targets\": \"[{\\\"ref_id\\\": \\\"35\\\"}]\", \"lemmas\": \"[{\\\"ref_id\\\": \\\"0\\\", \\\"lemma\\\": \\\"in\\\"}, {\\\"ref_id\\\": \\\"1\\\", \\\"...\", \"syntactic_relations\": \"[{\\\"type\\\": \\\"comp_in:N-N\\\", \\\"ref_id1\\\": \\\"2\\\", \\\"ref_id2\\\"...\"}", "columns": ["id", "sentence", "words", "genic_interactions", "agents", "targets", "lemmas", "syntactic_relations"], "columns_mapping": {"id": "id", "sentence": "sentence", "words": "words", "genic_interactions": "genic_interactions", "agents": "agents", "targets": "targets", "lemmas": "lemmas", "syntactic_relations": "syntactic_relations"}, "dataset_description": "The LLL05 challenge task is to learn rules to extract protein/gene interactions from biology abstracts from the Medline\nbibliography database. The goal of the challenge is to test the ability of the participating IE systems to identify the\ninteractions and the gene/proteins that interact. The participants will test their IE patterns on a test set with the\naim of extracting the correct agent and target.The challenge focuses on information extraction of gene interactions in\nBacillus subtilis. Extracting gene interaction is the most popular event IE task in biology. Bacillus subtilis (Bs) is\na model bacterium and many papers have been published on direct gene interactions involved in sporulation. The gene\ninteractions are generally mentioned in the abstract and the full text of the paper is not needed. Extracting gene\ninteraction means, extracting the agent (proteins) and the target (genes) of all couples of genic interactions from\nsentences.\n", "dataset_name": "bigbio/lll"}, "lll_bigbio_kb": {"config_name": "lll_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"11069677-3\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"\\\", \\\"text\\\": [\\\"In...\", \"entities\": \"[{\\\"id\\\": \\\"0-agent-29\\\", \\\"type\\\": \\\"agent\\\", \\\"text\\\": [\\\"K...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"2\\\", \\\"type\\\": \\\"genic_interaction\\\", \\\"arg1_id...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The LLL05 challenge task is to learn rules to extract protein/gene interactions from biology abstracts from the Medline\nbibliography database. The goal of the challenge is to test the ability of the participating IE systems to identify the\ninteractions and the gene/proteins that interact. The participants will test their IE patterns on a test set with the\naim of extracting the correct agent and target.The challenge focuses on information extraction of gene interactions in\nBacillus subtilis. Extracting gene interaction is the most popular event IE task in biology. Bacillus subtilis (Bs) is\na model bacterium and many papers have been published on direct gene interactions involved in sporulation. The gene\ninteractions are generally mentioned in the abstract and the full text of the paper is not needed. Extracting gene\ninteraction means, extracting the agent (proteins) and the target (genes) of all couples of genic interactions from\nsentences.\n", "dataset_name": "bigbio/lll"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/med_qa": {"dataset_name": "bigbio/med_qa", "description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.", "downloads": 2239, "configs": {"med_qa_en_source": {"config_name": "med_qa_en_source", "sample_row": "{\"meta_info\": \"\\\"step2&3\\\"\", \"question\": \"\\\"A 23-year-old pregnant woman at 22 weeks gestatio...\", \"answer_idx\": \"\\\"E\\\"\", \"answer\": \"\\\"Nitrofurantoin\\\"\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"Ampicillin\\\"}, {\\\"key\\\": \\\"B\\\",...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_en_bigbio_qa": {"config_name": "med_qa_en_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"A 23-year-old pregnant woman at 22 weeks gestatio...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"Ampicillin\\\", \\\"Ceftriaxone\\\", \\\"Ciprofloxacin\\\", \\\"Do...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"Nitrofurantoin\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_en_4options_source": {"config_name": "med_qa_en_4options_source", "sample_row": "{\"meta_info\": \"\\\"step2&3\\\"\", \"question\": \"\\\"A 23-year-old pregnant woman at 22 weeks gestatio...\", \"answer_idx\": \"\\\"D\\\"\", \"answer\": \"\\\"Nitrofurantoin\\\"\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"Ampicillin\\\"}, {\\\"key\\\": \\\"B\\\",...\", \"metamap_phrases\": \"[\\\"23 year old pregnant woman\\\", \\\"weeks presents\\\", \\\"...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options", "metamap_phrases"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options", "metamap_phrases": "metamap_phrases"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_en_4options_bigbio_qa": {"config_name": "med_qa_en_4options_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"A 23-year-old pregnant woman at 22 weeks gestatio...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"Ampicillin\\\", \\\"Ceftriaxone\\\", \\\"Doxycycline\\\", \\\"Nitr...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"Nitrofurantoin\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_zh_source": {"config_name": "med_qa_zh_source", "sample_row": "{\"meta_info\": \"\\\"\\\\u7b2c\\\\u4e09\\\\u90e8\\\\u5206\\\\u3000\\\\u7cbe\\\\u795e\\\\u795e\\\\...\", \"question\": \"\\\"\\\\u5367\\\\u4f4d\\\\u8170\\\\u690e\\\\u7a7f\\\\u523a\\\\uff0c\\\\u8111\\\\...\", \"answer_idx\": \"\\\"B\\\"\", \"answer\": \"\\\"80\\\\uff5e180mmH2O\\\\uff080.78\\\\uff5e1.76kPa\\\\uff09\\\"\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"190\\\\uff5e220mmH2O\\\\uff081.8...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_zh_bigbio_qa": {"config_name": "med_qa_zh_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"\\\\u5367\\\\u4f4d\\\\u8170\\\\u690e\\\\u7a7f\\\\u523a\\\\uff0c\\\\u8111\\\\...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"190\\\\uff5e220mmH2O\\\\uff081.86\\\\uff5e2.16kPa\\\\uff09\\\",...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"80\\\\uff5e180mmH2O\\\\uff080.78\\\\uff5e1.76kPa\\\\uff09\\\"]...\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_zh_4options_source": {"config_name": "med_qa_zh_4options_source", "sample_row": "{\"meta_info\": \"\\\"\\\\u7b2c\\\\u4e09\\\\u90e8\\\\u5206\\\\u3000\\\\u7cbe\\\\u795e\\\\u795e\\\\...\", \"question\": \"\\\"\\\\u5367\\\\u4f4d\\\\u8170\\\\u690e\\\\u7a7f\\\\u523a\\\\uff0c\\\\u8111\\\\...\", \"answer_idx\": \"\\\"A\\\"\", \"answer\": \"\\\"80\\\\uff5e180mmH2O\\\\uff080.78\\\\uff5e1.76kPa\\\\uff09\\\"\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"80\\\\uff5e180mmH2O\\\\uff080.78...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_zh_4options_bigbio_qa": {"config_name": "med_qa_zh_4options_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"\\\\u5367\\\\u4f4d\\\\u8170\\\\u690e\\\\u7a7f\\\\u523a\\\\uff0c\\\\u8111\\\\...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"80\\\\uff5e180mmH2O\\\\uff080.78\\\\uff5e1.76kPa\\\\uff09\\\", ...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"80\\\\uff5e180mmH2O\\\\uff080.78\\\\uff5e1.76kPa\\\\uff09\\\"]...\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_source": {"config_name": "med_qa_tw_source", "sample_row": "{\"meta_info\": \"\\\"taiwanese_test_Q\\\"\", \"question\": \"\\\"\\\\u4e0b\\\\u5217\\\\u4f55\\\\u8005\\\\u4e0d\\\\u662f\\\\u75c5\\\\u4eba\\\\...\", \"answer_idx\": \"\\\"C\\\"\", \"answer\": \"\\\"\\\\u97cc\\\\u5e36\\\\u9b06\\\\u5f1b\\\\uff0c\\\\u5ef6\\\\u5c55\\\\u6027\\\\...\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"\\\\u808c\\\\u8089\\\\u840e\\\\u7e2e\\\"}...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_bigbio_qa": {"config_name": "med_qa_tw_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"\\\\u4e0b\\\\u5217\\\\u4f55\\\\u8005\\\\u4e0d\\\\u662f\\\\u75c5\\\\u4eba\\\\...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"\\\\u808c\\\\u8089\\\\u840e\\\\u7e2e\\\", \\\"\\\\u808c\\\\u529b\\\\u6e1b\\\\u...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"\\\\u97cc\\\\u5e36\\\\u9b06\\\\u5f1b\\\\uff0c\\\\u5ef6\\\\u5c55\\\\u6027...\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_en_source": {"config_name": "med_qa_tw_en_source", "sample_row": "{\"meta_info\": \"\\\"taiwanese_test_Q\\\"\", \"question\": \"\\\"After the reaction physiology Which is not bedrid...\", \"answer_idx\": \"\\\"C\\\"\", \"answer\": \"\\\"Ligamentous laxity, increased ductility\\\"\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"Muscle atrophy\\\"}, {\\\"key\\\": ...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_en_bigbio_qa": {"config_name": "med_qa_tw_en_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"After the reaction physiology Which is not bedrid...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"Muscle atrophy\\\", \\\"Weakness\\\", \\\"Ligamentous laxity...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"Ligamentous laxity, increased ductility\\\"]\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_zh_source": {"config_name": "med_qa_tw_zh_source", "sample_row": "{\"meta_info\": \"\\\"taiwanese_test_Q\\\"\", \"question\": \"\\\"\\\\u4e0b\\\\u5217\\\\u4f55\\\\u8005\\\\u4e0d\\\\u662f\\\\u75c5\\\\u4eba\\\\...\", \"answer_idx\": \"\\\"C\\\"\", \"answer\": \"\\\"\\\\u97e7\\\\u5e26\\\\u677e\\\\u5f1b\\\\uff0c\\\\u5ef6\\\\u5c55\\\\u6027\\\\...\", \"options\": \"[{\\\"key\\\": \\\"A\\\", \\\"value\\\": \\\"\\\\u808c\\\\u8089\\\\u840e\\\\u7f29\\\"}...\"}", "columns": ["meta_info", "question", "answer_idx", "answer", "options"], "columns_mapping": {"meta_info": "meta_info", "question": "question", "answer_idx": "answer_idx", "answer": "answer", "options": "options"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}, "med_qa_tw_zh_bigbio_qa": {"config_name": "med_qa_tw_zh_bigbio_qa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question_id\": \"\\\"0\\\"\", \"document_id\": \"\\\"0\\\"\", \"question\": \"\\\"\\\\u4e0b\\\\u5217\\\\u4f55\\\\u8005\\\\u4e0d\\\\u662f\\\\u75c5\\\\u4eba\\\\...\", \"type\": \"\\\"multiple_choice\\\"\", \"choices\": \"[\\\"\\\\u808c\\\\u8089\\\\u840e\\\\u7f29\\\", \\\"\\\\u808c\\\\u529b\\\\u51cf\\\\u...\", \"context\": \"\\\"\\\"\", \"answer\": \"[\\\"\\\\u97e7\\\\u5e26\\\\u677e\\\\u5f1b\\\\uff0c\\\\u5ef6\\\\u5c55\\\\u6027...\"}", "columns": ["id", "question_id", "document_id", "question", "type", "choices", "context", "answer"], "columns_mapping": {"id": "id", "question_id": "question_id", "document_id": "document_id", "question": "question", "type": "type", "choices": "choices", "context": "context", "answer": "answer"}, "dataset_description": "In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,\ncollected from the professional medical board exams. It covers three languages: English, simplified Chinese, and\ntraditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together\nwith the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading\ncomprehension models can obtain necessary knowledge for answering the questions.\n", "dataset_name": "bigbio/med_qa"}}, "tags": ["multilinguality:multilingual", "language:en", "language:zh"], "is_gated": false}, "bigbio/medmentions": {"dataset_name": "bigbio/medmentions", "description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.", "downloads": 367, "configs": {"medmentions_full_source": {"config_name": "medmentions_full_source", "sample_row": "{\"pmid\": \"\\\"25763772\\\"\", \"passages\": \"[{\\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"DCTN4 as a modifier o...\", \"entities\": \"[{\\\"offsets\\\": [[0, 5]], \\\"text\\\": [\\\"DCTN4\\\"], \\\"semanti...\"}", "columns": ["pmid", "passages", "entities"], "columns_mapping": {"pmid": "pmid", "passages": "passages", "entities": "entities"}, "dataset_description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.\n", "dataset_name": "bigbio/medmentions"}, "medmentions_full_bigbio_kb": {"config_name": "medmentions_full_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"25763772\\\"\", \"passages\": \"[{\\\"id\\\": \\\"110\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"DCTN4 as...\", \"entities\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"T116\\\", \\\"text\\\": [\\\"DCTN4\\\"], \\\"o...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.\n", "dataset_name": "bigbio/medmentions"}, "medmentions_st21pv_source": {"config_name": "medmentions_st21pv_source", "sample_row": "{\"pmid\": \"\\\"25763772\\\"\", \"passages\": \"[{\\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"DCTN4 as a modifier o...\", \"entities\": \"[{\\\"offsets\\\": [[0, 5]], \\\"text\\\": [\\\"DCTN4\\\"], \\\"semanti...\"}", "columns": ["pmid", "passages", "entities"], "columns_mapping": {"pmid": "pmid", "passages": "passages", "entities": "entities"}, "dataset_description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.\n", "dataset_name": "bigbio/medmentions"}, "medmentions_st21pv_bigbio_kb": {"config_name": "medmentions_st21pv_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"25763772\\\"\", \"passages\": \"[{\\\"id\\\": \\\"67\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"DCTN4 as ...\", \"entities\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"T103\\\", \\\"text\\\": [\\\"DCTN4\\\"], \\\"o...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "MedMentions is a new manually annotated resource for the recognition of biomedical concepts.\nWhat distinguishes MedMentions from other annotated biomedical corpora is its size (over 4,000\nabstracts and over 350,000 linked mentions), as well as the size of the concept ontology (over\n3 million concepts from UMLS 2017) and its broad coverage of biomedical disciplines.\n\nCorpus: The MedMentions corpus consists of 4,392 papers (Titles and Abstracts) randomly selected\nfrom among papers released on PubMed in 2016, that were in the biomedical field, published in\nthe English language, and had both a Title and an Abstract.\n\nAnnotators: We recruited a team of professional annotators with rich experience in biomedical\ncontent curation to exhaustively annotate all UMLS\u00ae (2017AA full version) entity mentions in\nthese papers.\n\nAnnotation quality: We did not collect stringent IAA (Inter-annotator agreement) data. To gain\ninsight on the annotation quality of MedMentions, we randomly selected eight papers from the\nannotated corpus, containing a total of 469 concepts. Two biologists ('Reviewer') who did not\nparticipate in the annotation task then each reviewed four papers. The agreement between\nReviewers and Annotators, an estimate of the Precision of the annotations, was 97.3%.\n", "dataset_name": "bigbio/medmentions"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/meqsum": {"dataset_name": "bigbio/meqsum", "description": "Dataset for medical question summarization introduced in the ACL 2019 paper \"On the Summarization of Consumer Health\nQuestions\". Question understanding is one of the main challenges in question answering. In real world applications,\nusers often submit natural language questions that are longer than needed and include peripheral information that\nincreases the complexity of the question, leading to substantially more false positives in answer retrieval. In this\npaper, we study neural abstractive models for medical question summarization. We introduce the MeQSum corpus of 1,000\nsummarized consumer health questions.", "downloads": 24, "configs": {"meqsum_source": {"config_name": "meqsum_source", "sample_row": "{\"File\": \"\\\"1-131188152.xml.txt\\\"\", \"CHQ\": \"\\\"SUBJECT: who and where to get cetirizine - D\\\\nMES...\", \"Summary\": \"\\\"Who manufactures cetirizine?\\\"\"}", "columns": ["File", "CHQ", "Summary"], "columns_mapping": {"File": "File", "CHQ": "CHQ", "Summary": "Summary"}, "dataset_description": "Dataset for medical question summarization introduced in the ACL 2019 paper \"On the Summarization of Consumer Health\nQuestions\". Question understanding is one of the main challenges in question answering. In real world applications,\nusers often submit natural language questions that are longer than needed and include peripheral information that\nincreases the complexity of the question, leading to substantially more false positives in answer retrieval. In this\npaper, we study neural abstractive models for medical question summarization. We introduce the MeQSum corpus of 1,000\nsummarized consumer health questions.\n", "dataset_name": "bigbio/meqsum"}, "meqsum_bigbio_t2t": {"config_name": "meqsum_bigbio_t2t", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"1-131188152.xml.txt\\\"\", \"text_1\": \"\\\"SUBJECT: who and where to get cetirizine - D\\\\nMES...\", \"text_2\": \"\\\"Who manufactures cetirizine?\\\"\", \"text_1_name\": \"\\\"\\\"\", \"text_2_name\": \"\\\"\\\"\"}", "columns": ["id", "document_id", "text_1", "text_2", "text_1_name", "text_2_name"], "columns_mapping": {"id": "id", "document_id": "document_id", "text_1": "text_1", "text_2": "text_2", "text_1_name": "text_1_name", "text_2_name": "text_2_name"}, "dataset_description": "Dataset for medical question summarization introduced in the ACL 2019 paper \"On the Summarization of Consumer Health\nQuestions\". Question understanding is one of the main challenges in question answering. In real world applications,\nusers often submit natural language questions that are longer than needed and include peripheral information that\nincreases the complexity of the question, leading to substantially more false positives in answer retrieval. In this\npaper, we study neural abstractive models for medical question summarization. We introduce the MeQSum corpus of 1,000\nsummarized consumer health questions.\n", "dataset_name": "bigbio/meqsum"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/mirna": {"dataset_name": "bigbio/mirna", "description": "The corpus consists of 301 Medline citations. The documents were screened for\nmentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually\nannotated. The corpus comprises of two separate files, a train and a test set, coming\nfrom 201 and 100 documents respectively.", "downloads": 43, "configs": {"mirna_source": {"config_name": "mirna_source", "sample_row": "{\"passages\": \"[{\\\"document_id\\\": \\\"miRNA-corp.d0\\\", \\\"type\\\": \\\"title\\\",...\"}", "columns": ["passages"], "columns_mapping": {"passages": "passages"}, "dataset_description": "The corpus consists of 301 Medline citations. The documents were screened for\nmentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually\nannotated. The corpus comprises of two separate files, a train and a test set, coming\nfrom 201 and 100 documents respectively. \n", "dataset_name": "bigbio/mirna"}, "mirna_bigbio_kb": {"config_name": "mirna_bigbio_kb", "sample_row": "{\"id\": \"\\\"36\\\"\", \"document_id\": \"\\\"miRNA-corp.d0\\\"\", \"passages\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"Identifica...\", \"entities\": \"[{\\\"id\\\": \\\"7\\\", \\\"type\\\": \\\"Non-Specific_miRNAs\\\", \\\"text\\\"...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"27\\\", \\\"type\\\": \\\"Non-Specific_miRNAs-Disease...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The corpus consists of 301 Medline citations. The documents were screened for\nmentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually\nannotated. The corpus comprises of two separate files, a train and a test set, coming\nfrom 201 and 100 documents respectively. \n", "dataset_name": "bigbio/mirna"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/mqp": {"dataset_name": "bigbio/mqp", "description": "Medical Question Pairs dataset by McCreery et al (2020) contains pairs of medical questions and paraphrased versions of \nthe question prepared by medical professional. Paraphrased versions were labelled as similar (syntactically dissimilar \nbut contextually similar ) or dissimilar (syntactically may look similar but contextually dissimilar). Labels 1: similar, 0: dissimilar", "downloads": 140, "configs": {"mqp_source": {"config_name": "mqp_source", "sample_row": "{\"document_id\": \"\\\"1\\\"\", \"text_1\": \"\\\"After how many hour from drinking an antibiotic c...\", \"text_2\": \"\\\"I have a party tonight and I took my last dose of...\", \"label\": \"\\\"1\\\"\"}", "columns": ["document_id", "text_1", "text_2", "label"], "columns_mapping": {"document_id": "document_id", "text_1": "text_1", "text_2": "text_2", "label": "label"}, "dataset_description": "Medical Question Pairs dataset by McCreery et al (2020) contains pairs of medical questions and paraphrased versions of \nthe question prepared by medical professional. Paraphrased versions were labelled as similar (syntactically dissimilar \nbut contextually similar ) or dissimilar (syntactically may look similar but contextually dissimilar). Labels 1: similar, 0: dissimilar\n", "dataset_name": "bigbio/mqp"}, "mqp_bigbio_pairs": {"config_name": "mqp_bigbio_pairs", "sample_row": "{\"id\": \"\\\"1\\\"\", \"document_id\": \"\\\"1\\\"\", \"text_1\": \"\\\"After how many hour from drinking an antibiotic c...\", \"text_2\": \"\\\"I have a party tonight and I took my last dose of...\", \"label\": \"\\\"1\\\"\"}", "columns": ["id", "document_id", "text_1", "text_2", "label"], "columns_mapping": {"id": "id", "document_id": "document_id", "text_1": "text_1", "text_2": "text_2", "label": "label"}, "dataset_description": "Medical Question Pairs dataset by McCreery et al (2020) contains pairs of medical questions and paraphrased versions of \nthe question prepared by medical professional. Paraphrased versions were labelled as similar (syntactically dissimilar \nbut contextually similar ) or dissimilar (syntactically may look similar but contextually dissimilar). Labels 1: similar, 0: dissimilar\n", "dataset_name": "bigbio/mqp"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "greek_legal_code": {"dataset_name": "greek_legal_code", "description": "Greek_Legal_Code contains 47k classified legal resources from Greek Legislation. Its origin is \u201cPermanent Greek Legislation Code - Raptarchis\u201d,\na collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.", "downloads": 661, "configs": {"volume": {"config_name": "volume", "sample_row": "{\"text\": \"\\\"5. \\\\u0391\\\\u039d\\\\u0391\\\\u0393\\\\u039a. \\\\u039d\\\\u039f\\\\u...\", \"label\": \"41\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Greek_Legal_Code contains 47k classified legal resources from Greek Legislation. Its origin is \u201cPermanent Greek Legislation Code - Raptarchis\u201d,\na collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.\n", "dataset_name": "greek_legal_code"}, "chapter": {"config_name": "chapter", "sample_row": "{\"text\": \"\\\"5. \\\\u0391\\\\u039d\\\\u0391\\\\u0393\\\\u039a. \\\\u039d\\\\u039f\\\\u...\", \"label\": \"239\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Greek_Legal_Code contains 47k classified legal resources from Greek Legislation. Its origin is \u201cPermanent Greek Legislation Code - Raptarchis\u201d,\na collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.\n", "dataset_name": "greek_legal_code"}, "subject": {"config_name": "subject", "sample_row": "{\"text\": \"\\\"5. \\\\u0391\\\\u039d\\\\u0391\\\\u0393\\\\u039a. \\\\u039d\\\\u039f\\\\u...\", \"label\": \"1405\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Greek_Legal_Code contains 47k classified legal resources from Greek Legislation. Its origin is \u201cPermanent Greek Legislation Code - Raptarchis\u201d,\na collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.\n", "dataset_name": "greek_legal_code"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:topic-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:el"], "is_gated": false}, "harem": {"dataset_name": "harem", "description": "The HAREM is a Portuguese language corpus commonly used for Named Entity Recognition tasks. It includes about 93k words, from 129 different texts,\nfrom several genres, and language varieties. The split of this dataset version follows the division made by [1], where 7% HAREM\ndocuments are the validation set and the miniHAREM corpus (with about 65k words) is the test set. There are two versions of the dataset set,\na version that has a total of 10 different named entity classes (Person, Organization, Location, Value, Date, Title, Thing, Event,\nAbstraction, and Other) and a \"selective\" version with only 5 classes (Person, Organization, Location, Value, and Date).\n\nIt's important to note that the original version of the HAREM dataset has 2 levels of NER details, namely \"Category\" and \"Sub-type\".\nThe dataset version processed here ONLY USE the \"Category\" level of the original dataset.\n\n[1] Souza, F\u00e1bio, Rodrigo Nogueira, and Roberto Lotufo. \"BERTimbau: Pretrained BERT Models for Brazilian Portuguese.\" Brazilian Conference on Intelligent Systems. Springer, Cham, 2020.", "downloads": 518, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"HAREM-871-07800\\\"\", \"tokens\": \"[\\\"Abra\\\\u00e7o\\\", \\\"P\\\\u00e1gina\\\", \\\"Principal\\\", \\\"ASSOC...\", \"ner_tags\": \"[3, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 3, 0, 0, 3...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nThe HAREM is a Portuguese language corpus commonly used for Named Entity Recognition tasks. It includes about 93k words, from 129 different texts,\nfrom several genres, and language varieties. The split of this dataset version follows the division made by [1], where 7% HAREM\ndocuments are the validation set and the miniHAREM corpus (with about 65k words) is the test set. There are two versions of the dataset set,\na version that has a total of 10 different named entity classes (Person, Organization, Location, Value, Date, Title, Thing, Event,\nAbstraction, and Other) and a \"selective\" version with only 5 classes (Person, Organization, Location, Value, and Date).\n\nIt's important to note that the original version of the HAREM dataset has 2 levels of NER details, namely \"Category\" and \"Sub-type\".\nThe dataset version processed here ONLY USE the \"Category\" level of the original dataset.\n\n[1] Souza, F\u00e1bio, Rodrigo Nogueira, and Roberto Lotufo. \"BERTimbau: Pretrained BERT Models for Brazilian Portuguese.\" Brazilian Conference on Intelligent Systems. Springer, Cham, 2020.\n", "dataset_name": "harem"}, "selective": {"config_name": "selective", "sample_row": "{\"id\": \"\\\"HAREM-871-07800\\\"\", \"tokens\": \"[\\\"Abra\\\\u00e7o\\\", \\\"P\\\\u00e1gina\\\", \\\"Principal\\\", \\\"ASSOC...\", \"ner_tags\": \"[3, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 3, 0, 0, 3...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nThe HAREM is a Portuguese language corpus commonly used for Named Entity Recognition tasks. It includes about 93k words, from 129 different texts,\nfrom several genres, and language varieties. The split of this dataset version follows the division made by [1], where 7% HAREM\ndocuments are the validation set and the miniHAREM corpus (with about 65k words) is the test set. There are two versions of the dataset set,\na version that has a total of 10 different named entity classes (Person, Organization, Location, Value, Date, Title, Thing, Event,\nAbstraction, and Other) and a \"selective\" version with only 5 classes (Person, Organization, Location, Value, and Date).\n\nIt's important to note that the original version of the HAREM dataset has 2 levels of NER details, namely \"Category\" and \"Sub-type\".\nThe dataset version processed here ONLY USE the \"Category\" level of the original dataset.\n\n[1] Souza, F\u00e1bio, Rodrigo Nogueira, and Roberto Lotufo. \"BERTimbau: Pretrained BERT Models for Brazilian Portuguese.\" Brazilian Conference on Intelligent Systems. Springer, Cham, 2020.\n", "dataset_name": "harem"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt"], "is_gated": false}, "has_part": {"dataset_name": "has_part", "description": "This dataset is a new knowledge-base (KB) of hasPart relationships, extracted from a large corpus of generic statements. Complementary to other resources available, it is the first which is all three of: accurate (90% precision), salient (covers relationships a person may mention), and has high coverage of common terms (approximated as within a 10 year old\u2019s vocabulary), as well as having several times more hasPart entries than in the popular ontologies ConceptNet and WordNet. In addition, it contains information about quantifiers, argument modifiers, and links the entities to appropriate concepts in Wikipedia and WordNet.", "downloads": 318, "configs": {"default": {"config_name": "default", "sample_row": "{\"arg1\": \"\\\"snowdrop\\\"\", \"arg2\": \"\\\"carpel\\\"\", \"score\": \"0.9990746974945068\", \"wikipedia_primary_page\": \"[\\\"Galanthus\\\"]\", \"synset\": \"[\\\"wn.carpel.n.01\\\"]\"}", "columns": ["arg1", "arg2", "score", "wikipedia_primary_page", "synset"], "columns_mapping": {"arg1": "arg1", "arg2": "arg2", "score": "score", "wikipedia_primary_page": "wikipedia_primary_page", "synset": "synset"}, "dataset_description": "This dataset is a new knowledge-base (KB) of hasPart relationships, extracted from a large corpus of generic statements. Complementary to other resources available, it is the first which is all three of: accurate (90% precision), salient (covers relationships a person may mention), and has high coverage of common terms (approximated as within a 10 year old\u2019s vocabulary), as well as having several times more hasPart entries than in the popular ontologies ConceptNet and WordNet. In addition, it contains information about quantifiers, argument modifiers, and links the entities to appropriate concepts in Wikipedia and WordNet.\n", "dataset_name": "has_part"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:extended|other-Generics-KB", "language:en", "Meronym-Prediction"], "is_gated": false}, "hate_speech18": {"dataset_name": "hate_speech18", "description": "These files contain text extracted from Stormfront, a white supremacist forum. A random set of\nforums posts have been sampled from several subforums and split into sentences. Those sentences\nhave been manually labelled as containing hate speech or not, according to certain annotation guidelines.", "downloads": 12316, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"As of March 13th , 2014 , the booklet had been do...\", \"user_id\": \"572066\", \"subforum_id\": \"1346\", \"num_contexts\": \"0\", \"label\": \"0\"}", "columns": ["text", "user_id", "subforum_id", "num_contexts", "label"], "columns_mapping": {"text": "text", "user_id": "user_id", "subforum_id": "subforum_id", "num_contexts": "num_contexts", "label": "label"}, "dataset_description": "These files contain text extracted from Stormfront, a white supremacist forum. A random set of\nforums posts have been sampled from several subforums and split into sentences. Those sentences\nhave been manually labelled as containing hate speech or not, according to certain annotation guidelines.\n", "dataset_name": "hate_speech18"}}, "tags": ["task_categories:text-classification", "task_ids:intent-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "hate_speech_filipino": {"dataset_name": "hate_speech_filipino", "description": " Contains 10k tweets (training set) that are labeled as hate speech or non-hate speech. Released with 4,232 validation and 4,232 testing samples. Collected during the 2016 Philippine Presidential Elections.", "downloads": 314, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"Inaasahan na ni Vice President Jejomar Binay na m...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": " Contains 10k tweets (training set) that are labeled as hate speech or non-hate speech. Released with 4,232 validation and 4,232 testing samples. Collected during the 2016 Philippine Presidential Elections.\n", "dataset_name": "hate_speech_filipino"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-analysis", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:extended|other-twitter-data-philippine-election", "language:tl"], "is_gated": false}, "hate_speech_offensive": {"dataset_name": "hate_speech_offensive", "description": "An annotated dataset for hate speech and offensive language detection on tweets.", "downloads": 8425, "configs": {"default": {"config_name": "default", "sample_row": "{\"count\": \"3\", \"hate_speech_count\": \"0\", \"offensive_language_count\": \"0\", \"neither_count\": \"3\", \"class\": \"2\", \"tweet\": \"\\\"!!! RT @mayasolovely: As a woman you shouldn't co...\"}", "columns": ["count", "hate_speech_count", "offensive_language_count", "neither_count", "class", "tweet"], "columns_mapping": {"count": "count", "hate_speech_count": "hate_speech_count", "offensive_language_count": "offensive_language_count", "neither_count": "neither_count", "class": "class", "tweet": "tweet"}, "dataset_description": "An annotated dataset for hate speech and offensive language detection on tweets.\n", "dataset_name": "hate_speech_offensive"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "hate-speech-detection"], "is_gated": false}, "hate_speech_pl": {"dataset_name": "hate_speech_pl", "description": "HateSpeech corpus in the current version contains over 2000 posts crawled from public Polish web. They represent various types and degrees of offensive language, expressed toward minorities (eg. ethnical, racial). The data were annotated manually.", "downloads": 314, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"1\", \"text_id\": \"121713\", \"annotator_id\": \"1\", \"minority_id\": \"72\", \"negative_emotions\": \"true\", \"call_to_action\": \"true\", \"source_of_knowledge\": \"2\", \"irony_sarcasm\": \"true\", \"topic\": \"18\", \"text\": \"\\\" Niemiec m\\\\u00f3wi c...\", \"rating\": \"0\"}", "columns": ["id", "text_id", "annotator_id", "minority_id", "negative_emotions", "call_to_action", "source_of_knowledge", "irony_sarcasm", "topic", "text", "rating"], "columns_mapping": {"id": "id", "text_id": "text_id", "annotator_id": "annotator_id", "minority_id": "minority_id", "negative_emotions": "negative_emotions", "call_to_action": "call_to_action", "source_of_knowledge": "source_of_knowledge", "irony_sarcasm": "irony_sarcasm", "topic": "topic", "text": "text", "rating": "rating"}, "dataset_description": "HateSpeech corpus in the current version contains over 2000 posts crawled from public Polish web. They represent various types and degrees of offensive language, expressed toward minorities (eg. ethnical, racial). The data were annotated manually.\n", "dataset_name": "hate_speech_pl"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "task_ids:sentiment-classification", "task_ids:sentiment-scoring", "task_ids:topic-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl"], "is_gated": false}, "hate_speech_portuguese": {"dataset_name": "hate_speech_portuguese", "description": "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate').", "downloads": 322, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"@__andrea__b \\\\nO cara vive em outro mundo\\\\nN\\\\u00e...\", \"label\": \"1\", \"hatespeech_G1\": \"\\\"1\\\"\", \"annotator_G1\": \"\\\"A\\\"\", \"hatespeech_G2\": \"\\\"1\\\"\", \"annotator_G2\": \"\\\"V\\\"\", \"hatespeech_G3\": \"\\\"0\\\"\", \"annotator_G3\": \"\\\"E\\\"\"}", "columns": ["text", "label", "hatespeech_G1", "annotator_G1", "hatespeech_G2", "annotator_G2", "hatespeech_G3", "annotator_G3"], "columns_mapping": {"text": "text", "label": "label", "hatespeech_G1": "hatespeech_G1", "annotator_G1": "annotator_G1", "hatespeech_G2": "hatespeech_G2", "annotator_G2": "annotator_G2", "hatespeech_G3": "hatespeech_G3", "annotator_G3": "annotator_G3"}, "dataset_description": "Portuguese dataset for hate speech detection composed of 5,668 tweets with binary annotations (i.e. 'hate' vs. 'no-hate').\n", "dataset_name": "hate_speech_portuguese"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt", "hate-speech-detection"], "is_gated": false}, "hatexplain": {"dataset_name": "hatexplain", "description": "Hatexplain is the first benchmark hate speech dataset covering multiple aspects of the issue. Each post in the dataset is annotated from three different perspectives: the basic, commonly used 3-class classification (i.e., hate, offensive or normal), the target community (i.e., the community that has been the victim of hate speech/offensive speech in the post), and the rationales, i.e., the portions of the post on which their labelling decision (as hate, offensive or normal) is based.", "downloads": 1393, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"23107796_gab\\\"\", \"annotators.label\": \"[0, 2, 2]\", \"annotators.annotator_id\": \"[203, 204, 233]\", \"annotators.target\": \"[[\\\"Hindu\\\", \\\"Islam\\\"], [\\\"Hindu\\\", \\\"Islam\\\"], [\\\"Hindu\\\",...\", \"rationales\": \"[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...\", \"post_tokens\": \"[\\\"u\\\", \\\"really\\\", \\\"think\\\", \\\"i\\\", \\\"would\\\", \\\"not\\\", \\\"hav...\"}", "columns": ["id", "annotators_label", "annotators_annotator_id", "annotators_target", "rationales", "post_tokens"], "columns_mapping": {"id": "id", "annotators.label": "annotators_label", "annotators.annotator_id": "annotators_annotator_id", "annotators.target": "annotators_target", "rationales": "rationales", "post_tokens": "post_tokens"}, "dataset_description": "Hatexplain is the first benchmark hate speech dataset covering multiple aspects of the issue. Each post in the dataset is annotated from three different perspectives: the basic, commonly used 3-class classification (i.e., hate, offensive or normal), the target community (i.e., the community that has been the victim of hate speech/offensive speech in the post), and the rationales, i.e., the portions of the post on which their labelling decision (as hate, offensive or normal) is based.\n", "dataset_name": "hatexplain"}}, "tags": ["task_categories:text-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "hate-speech-detection"], "is_gated": false}, "hausa_voa_ner": {"dataset_name": "hausa_voa_ner", "description": "The Hausa VOA NER dataset is a labeled dataset for named entity recognition in Hausa. The texts were obtained from\nHausa Voice of America News articles https://www.voahausa.com/ . We concentrate on\nfour types of named entities: persons [PER], locations [LOC], organizations [ORG], and dates & time [DATE].\n\nThe Hausa VOA NER data files contain 2 columns separated by a tab ('\\t'). Each word has been put on a separate line and\nthere is an empty line after each sentences i.e the CoNLL format. The first item on each line is a word, the second\nis the named entity tag. The named entity tags have the format I-TYPE which means that the word is inside a phrase\nof type TYPE. For every multi-word expression like 'New York', the first word gets a tag B-TYPE and the subsequent words\nhave tags I-TYPE, a word with tag O is not part of a phrase. The dataset is in the BIO tagging scheme.\n\nFor more details, see https://www.aclweb.org/anthology/2020.emnlp-main.204/", "downloads": 288, "configs": {"hausa_voa_ner": {"config_name": "hausa_voa_ner", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ya\\\", \\\"Kammala\\\", \\\"Ziyarar\\\", \\\"Yakin\\\", \\\"Neman\\\", \\\"Za...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 5]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The Hausa VOA NER dataset is a labeled dataset for named entity recognition in Hausa. The texts were obtained from\nHausa Voice of America News articles https://www.voahausa.com/ . We concentrate on\nfour types of named entities: persons [PER], locations [LOC], organizations [ORG], and dates & time [DATE].\n\nThe Hausa VOA NER data files contain 2 columns separated by a tab ('\t'). Each word has been put on a separate line and\nthere is an empty line after each sentences i.e the CoNLL format. The first item on each line is a word, the second\nis the named entity tag. The named entity tags have the format I-TYPE which means that the word is inside a phrase\nof type TYPE. For every multi-word expression like 'New York', the first word gets a tag B-TYPE and the subsequent words\nhave tags I-TYPE, a word with tag O is not part of a phrase. The dataset is in the BIO tagging scheme.\n\nFor more details, see https://www.aclweb.org/anthology/2020.emnlp-main.204/\n", "dataset_name": "hausa_voa_ner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ha"], "is_gated": false}, "hausa_voa_topics": {"dataset_name": "hausa_voa_topics", "description": "A collection of news article headlines in Hausa from VOA Hausa.\nEach headline is labeled with one of the following classes: Nigeria,\nAfrica, World, Health or Politics.\n\nThe dataset was presented in the paper:\nHedderich, Adelani, Zhu, Alabi, Markus, Klakow: Transfer Learning and\nDistant Supervision for Multilingual Transformer Models: A Study on\nAfrican Languages (EMNLP 2020).", "downloads": 311, "configs": {"default": {"config_name": "default", "sample_row": "{\"news_title\": \"\\\"Atiku Abubakar Ya Kada Kuri'arsa A Jimeta A Jihar...\", \"label\": \"3\"}", "columns": ["news_title", "label"], "columns_mapping": {"news_title": "news_title", "label": "label"}, "dataset_description": "A collection of news article headlines in Hausa from VOA Hausa.\nEach headline is labeled with one of the following classes: Nigeria,\nAfrica, World, Health or Politics.\n\nThe dataset was presented in the paper:\nHedderich, Adelani, Zhu, Alabi, Markus, Klakow: Transfer Learning and\nDistant Supervision for Multilingual Transformer Models: A Study on\nAfrican Languages (EMNLP 2020).\n", "dataset_name": "hausa_voa_topics"}}, "tags": ["task_categories:text-classification", "task_ids:topic-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ha"], "is_gated": false}, "head_qa": {"dataset_name": "head_qa", "description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\n\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.", "downloads": 1148, "configs": {"es": {"config_name": "es", "sample_row": "{\"name\": \"\\\"Cuaderno_2013_1_B\\\"\", \"year\": \"\\\"2013\\\"\", \"category\": \"\\\"biology\\\"\", \"qid\": \"1\", \"qtext\": \"\\\"Los potenciales postsin\\\\u00e1pticos excitadores:\\\"...\", \"ra\": \"3\", \"image\": \"null\", \"answers\": \"[{\\\"aid\\\": 1, \\\"atext\\\": \\\"Son de tipo todo o nada.\\\"}, ...\"}", "columns": ["dataset_name", "year", "category", "qid", "qtext", "ra", "image", "answers"], "columns_mapping": {"dataset_name": "dataset_name", "year": "year", "category": "category", "qid": "qid", "qtext": "qtext", "ra": "ra", "image": "image", "answers": "answers"}, "dataset_description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\n\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "dataset_name": "head_qa"}, "en": {"config_name": "en", "sample_row": "{\"name\": \"\\\"Cuaderno_2013_1_B\\\"\", \"year\": \"\\\"2013\\\"\", \"category\": \"\\\"biology\\\"\", \"qid\": \"1\", \"qtext\": \"\\\"The excitatory postsynaptic potentials:\\\"\", \"ra\": \"3\", \"image\": \"null\", \"answers\": \"[{\\\"aid\\\": 1, \\\"atext\\\": \\\"They are all or nothing.\\\"}, ...\"}", "columns": ["dataset_name", "year", "category", "qid", "qtext", "ra", "image", "answers"], "columns_mapping": {"dataset_name": "dataset_name", "year": "year", "category": "category", "qid": "qid", "qtext": "qtext", "ra": "ra", "image": "image", "answers": "answers"}, "dataset_description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\n\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "dataset_name": "head_qa"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en", "language:es"], "is_gated": false}, "hebrew_projectbenyehuda": {"dataset_name": "hebrew_projectbenyehuda", "description": "This repository contains a dump of thousands of public domain works in Hebrew, from Project Ben-Yehuda, in plaintext UTF-8 files, with and without diacritics (nikkud). The metadata (pseudocatalogue.csv) file is a list of titles, authors, genres, and file paths, to help you process the dump.\nAll these works are in the public domain, so you are free to make any use of them, and do not need to ask for permission.\nThere are 10078 files, 3181136 lines", "downloads": 290, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"10\", \"url\": \"\\\"https://raw.githubusercontent.com/projectbenyehud...\", \"title\": \"\\\"\\\\u05d7\\\\u05e6\\\\u05d9-\\\\u05e0\\\\u05d7\\\\u05de\\\\u05d4\\\"\", \"authors\": \"\\\"\\\\u05d0\\\\u05d7\\\\u05d3 \\\\u05d4\\\\u05e2\\\\u05dd\\\"\", \"translators\": \"\\\"\\\"\", \"original_language\": \"\\\"380425\\\"\", \"genre\": \"\\\"\\\"\", \"source_edition\": \"\\\"\\\"\", \"text\": \"\\\"\\\\n\\\\n\\\\n\\\\t\\\\n\\\\t\\\\u05d7\\\\u05e6\\\\u05d9-\\\\u05e0\\\\u05d7\\\\u05de...\"}", "columns": ["id", "url", "title", "authors", "translators", "original_language", "genre", "source_edition", "text"], "columns_mapping": {"id": "id", "url": "url", "title": "title", "authors": "authors", "translators": "translators", "original_language": "original_language", "genre": "genre", "source_edition": "source_edition", "text": "text"}, "dataset_description": "This repository contains a dump of thousands of public domain works in Hebrew, from Project Ben-Yehuda, in plaintext UTF-8 files, with and without diacritics (nikkud). The metadata (pseudocatalogue.csv) file is a list of titles, authors, genres, and file paths, to help you process the dump.\nAll these works are in the public domain, so you are free to make any use of them, and do not need to ask for permission.\nThere are 10078 files, 3181136 lines\n", "dataset_name": "hebrew_projectbenyehuda"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:he"], "is_gated": false}, "hebrew_sentiment": {"dataset_name": "hebrew_sentiment", "description": "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel\u2019s\npresident, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder,\n2013) to scrape all the comments to all of the president\u2019s posts in the period of June \u2013 August 2014,\nthe first three months of Rivlin\u2019s presidency.2 While the president\u2019s posts aimed at reconciling tensions\nand called for tolerance and empathy, the sentiment expressed in the comments to the president\u2019s posts\nwas polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his\npolicy. Of the 12,804 comments, 370 are neutral; 8,512 are positive, 3,922 negative.\n\nData Annotation: A trained researcher examined each comment and determined its sentiment value,\nwhere comments with an overall positive sentiment were assigned the value 1, comments with an overall\nnegative sentiment were assigned the value -1, and comments that are off-topic to the post\u2019s content\nwere assigned the value 0. We validated the coding scheme by asking a second trained researcher to\ncode the same data. There was substantial agreement between raters (N of agreements: 10623, N of\ndisagreements: 2105, Coehn\u2019s Kappa = 0.697, p = 0).", "downloads": 500, "configs": {"token": {"config_name": "token", "sample_row": "{\"text\": \"\\\"\\\\u05de\\\\u05de\\\\u05e9 \\\\u05db\\\\u05d5\\\\u05d0\\\\u05d1 ........\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel\u2019s\npresident, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder,\n2013) to scrape all the comments to all of the president\u2019s posts in the period of June \u2013 August 2014,\nthe first three months of Rivlin\u2019s presidency.2 While the president\u2019s posts aimed at reconciling tensions\nand called for tolerance and empathy, the sentiment expressed in the comments to the president\u2019s posts\nwas polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his\npolicy. Of the 12,804 comments, 370 are neutral; 8,512 are positive, 3,922 negative.\n\nData Annotation: A trained researcher examined each comment and determined its sentiment value,\nwhere comments with an overall positive sentiment were assigned the value 1, comments with an overall\nnegative sentiment were assigned the value -1, and comments that are off-topic to the post\u2019s content\nwere assigned the value 0. We validated the coding scheme by asking a second trained researcher to\ncode the same data. There was substantial agreement between raters (N of agreements: 10623, N of\ndisagreements: 2105, Coehn\u2019s Kappa = 0.697, p = 0).\n", "dataset_name": "hebrew_sentiment"}, "morph": {"config_name": "morph", "sample_row": "{\"text\": \"\\\"\\\\u05de\\\\u05de\\\\u05e9 \\\\u05db\\\\u05d5\\\\u05d0\\\\u05d1 ........\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel\u2019s\npresident, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder,\n2013) to scrape all the comments to all of the president\u2019s posts in the period of June \u2013 August 2014,\nthe first three months of Rivlin\u2019s presidency.2 While the president\u2019s posts aimed at reconciling tensions\nand called for tolerance and empathy, the sentiment expressed in the comments to the president\u2019s posts\nwas polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his\npolicy. Of the 12,804 comments, 370 are neutral; 8,512 are positive, 3,922 negative.\n\nData Annotation: A trained researcher examined each comment and determined its sentiment value,\nwhere comments with an overall positive sentiment were assigned the value 1, comments with an overall\nnegative sentiment were assigned the value -1, and comments that are off-topic to the post\u2019s content\nwere assigned the value 0. We validated the coding scheme by asking a second trained researcher to\ncode the same data. There was substantial agreement between raters (N of agreements: 10623, N of\ndisagreements: 2105, Coehn\u2019s Kappa = 0.697, p = 0).\n", "dataset_name": "hebrew_sentiment"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:he"], "is_gated": false}, "Rowan/hellaswag": {"dataset_name": "Rowan/hellaswag", "description": "HellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.", "downloads": 2165, "configs": {"default": {"config_name": "default", "sample_row": "{\"ind\": \"4\", \"activity_label\": \"\\\"Removing ice from car\\\"\", \"ctx_a\": \"\\\"Then, the man writes over the snow covering the w...\", \"ctx_b\": \"\\\"then\\\"\", \"ctx\": \"\\\"Then, the man writes over the snow covering the w...\", \"endings\": \"[\\\", the man adds wax to the windshield and cuts it...\", \"source_id\": \"\\\"activitynet~v_-1IBHYS3L-Y\\\"\", \"split\": \"\\\"train\\\"\", \"split_type\": \"\\\"indomain\\\"\", \"label\": \"\\\"3\\\"\"}", "columns": ["ind", "activity_label", "ctx_a", "ctx_b", "ctx", "endings", "source_id", "split", "split_type", "label"], "columns_mapping": {"ind": "ind", "activity_label": "activity_label", "ctx_a": "ctx_a", "ctx_b": "ctx_b", "ctx": "ctx", "endings": "endings", "source_id": "source_id", "split": "split", "split_type": "split_type", "label": "label"}, "dataset_description": "\nHellaSwag: Can a Machine Really Finish Your Sentence? is a new dataset for commonsense NLI. A paper was published at ACL2019.\n", "dataset_name": "Rowan/hellaswag"}}, "tags": ["language:en"], "is_gated": false}, "hind_encorp": {"dataset_name": "hind_encorp", "description": "HindEnCorp parallel texts (sentence-aligned) come from the following sources:\nTides, which contains 50K sentence pairs taken mainly from news articles. This dataset was originally col- lected for the DARPA-TIDES surprise-language con- test in 2002, later refined at IIIT Hyderabad and provided for the NLP Tools Contest at ICON 2008 (Venkatapathy, 2008).\n\nCommentaries by Daniel Pipes contain 322 articles in English written by a journalist Daniel Pipes and translated into Hindi.\n\nEMILLE. This corpus (Baker et al., 2002) consists of three components: monolingual, parallel and annotated corpora. There are fourteen monolingual sub- corpora, including both written and (for some lan- guages) spoken data for fourteen South Asian lan- guages. The EMILLE monolingual corpora contain in total 92,799,000 words (including 2,627,000 words of transcribed spoken data for Bengali, Gujarati, Hindi, Punjabi and Urdu). The parallel corpus consists of 200,000 words of text in English and its accompanying translations into Hindi and other languages.\n\nSmaller datasets as collected by Bojar et al. (2010) include the corpus used at ACL 2005 (a subcorpus of EMILLE), a corpus of named entities from Wikipedia (crawled in 2009), and Agriculture domain parallel corpus.\n\ufffc\nFor the current release, we are extending the parallel corpus using these sources:\nIntercorp (\u010cerm\u00e1k and Rosen,2012) is a large multilingual parallel corpus of 32 languages including Hindi. The central language used for alignment is Czech. Intercorp\u2019s core texts amount to 202 million words. These core texts are most suitable for us because their sentence alignment is manually checked and therefore very reliable. They cover predominately short sto- ries and novels. There are seven Hindi texts in Inter- corp. Unfortunately, only for three of them the English translation is available; the other four are aligned only with Czech texts. The Hindi subcorpus of Intercorp contains 118,000 words in Hindi.\n\nTED talks 3 held in various languages, primarily English, are equipped with transcripts and these are translated into 102 languages. There are 179 talks for which Hindi translation is available.\n\nThe Indic multi-parallel corpus (Birch et al., 2011; Post et al., 2012) is a corpus of texts from Wikipedia translated from the respective Indian language into English by non-expert translators hired over Mechanical Turk. The quality is thus somewhat mixed in many respects starting from typesetting and punctuation over capi- talization, spelling, word choice to sentence structure. A little bit of control could be in principle obtained from the fact that every input sentence was translated 4 times. We used the 2012 release of the corpus.\n\nLaunchpad.net is a software collaboration platform that hosts many open-source projects and facilitates also collaborative localization of the tools. We downloaded all revisions of all the hosted projects and extracted the localization (.po) files.\n\nOther smaller datasets. This time, we added Wikipedia entities as crawled in 2013 (including any morphological variants of the named entitity that appears on the Hindi variant of the Wikipedia page) and words, word examples and quotes from the Shabdkosh online dictionary.", "downloads": 301, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"source\": \"\\\"wikiner2013inflected\\\"\", \"alignment_type\": \"\\\"1-1\\\"\", \"alignment_quality\": \"\\\"1.000\\\"\", \"translation.en\": \"\\\"Sharaabi\\\"\", \"translation.hi\": \"\\\"\\\\u0936\\\\u0930\\\\u093e\\\\u092c\\\\u0940\\\"\"}", "columns": ["id", "source", "alignment_type", "alignment_quality", "translation_en", "translation_hi"], "columns_mapping": {"id": "id", "source": "source", "alignment_type": "alignment_type", "alignment_quality": "alignment_quality", "translation.en": "translation_en", "translation.hi": "translation_hi"}, "dataset_description": "HindEnCorp parallel texts (sentence-aligned) come from the following sources:\nTides, which contains 50K sentence pairs taken mainly from news articles. This dataset was originally col- lected for the DARPA-TIDES surprise-language con- test in 2002, later refined at IIIT Hyderabad and provided for the NLP Tools Contest at ICON 2008 (Venkatapathy, 2008).\n\nCommentaries by Daniel Pipes contain 322 articles in English written by a journalist Daniel Pipes and translated into Hindi.\n\nEMILLE. This corpus (Baker et al., 2002) consists of three components: monolingual, parallel and annotated corpora. There are fourteen monolingual sub- corpora, including both written and (for some lan- guages) spoken data for fourteen South Asian lan- guages. The EMILLE monolingual corpora contain in total 92,799,000 words (including 2,627,000 words of transcribed spoken data for Bengali, Gujarati, Hindi, Punjabi and Urdu). The parallel corpus consists of 200,000 words of text in English and its accompanying translations into Hindi and other languages.\n\nSmaller datasets as collected by Bojar et al. (2010) include the corpus used at ACL 2005 (a subcorpus of EMILLE), a corpus of named entities from Wikipedia (crawled in 2009), and Agriculture domain parallel corpus.\n\ufffc\nFor the current release, we are extending the parallel corpus using these sources:\nIntercorp (\u010cerm\u00e1k and Rosen,2012) is a large multilingual parallel corpus of 32 languages including Hindi. The central language used for alignment is Czech. Intercorp\u2019s core texts amount to 202 million words. These core texts are most suitable for us because their sentence alignment is manually checked and therefore very reliable. They cover predominately short sto- ries and novels. There are seven Hindi texts in Inter- corp. Unfortunately, only for three of them the English translation is available; the other four are aligned only with Czech texts. The Hindi subcorpus of Intercorp contains 118,000 words in Hindi.\n\nTED talks 3 held in various languages, primarily English, are equipped with transcripts and these are translated into 102 languages. There are 179 talks for which Hindi translation is available.\n\nThe Indic multi-parallel corpus (Birch et al., 2011; Post et al., 2012) is a corpus of texts from Wikipedia translated from the respective Indian language into English by non-expert translators hired over Mechanical Turk. The quality is thus somewhat mixed in many respects starting from typesetting and punctuation over capi- talization, spelling, word choice to sentence structure. A little bit of control could be in principle obtained from the fact that every input sentence was translated 4 times. We used the 2012 release of the corpus.\n\nLaunchpad.net is a software collaboration platform that hosts many open-source projects and facilitates also collaborative localization of the tools. We downloaded all revisions of all the hosted projects and extracted the localization (.po) files.\n\nOther smaller datasets. This time, we added Wikipedia entities as crawled in 2013 (including any morphological variants of the named entitity that appears on the Hindi variant of the Wikipedia page) and words, word examples and quotes from the Shabdkosh online dictionary.\n", "dataset_name": "hind_encorp"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:en", "language:hi"], "is_gated": false}, "hindi_discourse": {"dataset_name": "hindi_discourse", "description": "The Hindi Discourse Analysis dataset is a corpus for analyzing discourse modes present in its sentences.\nIt contains sentences from stories written by 11 famous authors from the 20th Century.\n4-5 stories by each author have been selected which were available in the public domain resulting\nin a collection of 53 stories. Most of these short stories were originally written in Hindi\nbut some of them were written in other Indian languages and later translated to Hindi.", "downloads": 297, "configs": {"default": {"config_name": "default", "sample_row": "{\"Story_no\": \"0\", \"Sentence\": \"\\\"\\\\u091a\\\\u0947\\\\u0939\\\\u0930\\\\u0947 \\\\u092a\\\\u0930 \\\\u090...\", \"Discourse Mode\": \"1\"}", "columns": ["Story_no", "Sentence", "Discourse Mode"], "columns_mapping": {"Story_no": "Story_no", "Sentence": "Sentence", "Discourse Mode": "Discourse Mode"}, "dataset_description": "The Hindi Discourse Analysis dataset is a corpus for analyzing discourse modes present in its sentences.\nIt contains sentences from stories written by 11 famous authors from the 20th Century.\n4-5 stories by each author have been selected which were available in the public domain resulting\nin a collection of 53 stories. Most of these short stories were originally written in Hindi\nbut some of them were written in other Indian languages and later translated to Hindi.\n", "dataset_name": "hindi_discourse"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "annotations_creators:other", "multilinguality:monolingual", "source_datasets:original", "language:hi", "discourse-analysis"], "is_gated": false}, "hkcancor": {"dataset_name": "hkcancor", "description": "The Hong Kong Cantonese Corpus (HKCanCor) comprise transcribed conversations\nrecorded between March 1997 and August 1998. It contains recordings of\nspontaneous speech (51 texts) and radio programmes (42 texts),\nwhich involve 2 to 4 speakers, with 1 text of monologue.\n\nIn total, the corpus contains around 230,000 Chinese words.\nThe text is word-segmented, annotated with part-of-speech (POS) tags and\nromanised Cantonese pronunciation.\n\nRomanisation scheme - Linguistic Society of Hong Kong (LSHK)\nPOS scheme - Peita-Fujitsu-Renmin Ribao (PRF) corpus (Duan et al., 2000),\n with extended tags for Cantonese-specific phenomena added by\n Luke and Wang (see original paper for details).", "downloads": 316, "configs": {"default": {"config_name": "default", "sample_row": "{\"conversation_id\": \"\\\"TN001-DR300497-WAI3C\\\"\", \"speaker\": \"\\\"A\\\"\", \"turn_number\": \"0\", \"tokens\": \"[\\\"\\\\u5582\\\", \\\"\\\\u9072\\\", \\\"\\\\u5572\\\", \\\"\\\\u53bb\\\", \\\"\\\\u5514\\\",...\", \"transcriptions\": \"[\\\"wai3\\\", \\\"ci4\\\", \\\"di1\\\", \\\"heoi3\\\", \\\"m4\\\", \\\"heoi3\\\", \\\"le...\", \"pos_tags_prf\": \"[24, 9, 72, 75, 21, 75, 80, 116, 83, 64, 50, 76, 9...\", \"pos_tags_ud\": \"[15, 0, 6, 11, 10, 11, 8, 6, 2, 14, 8, 11, 0, 8, 6...\"}", "columns": ["conversation_id", "speaker", "turn_number", "tokens", "transcriptions", "pos_tags_prf", "pos_tags_ud"], "columns_mapping": {"conversation_id": "conversation_id", "speaker": "speaker", "turn_number": "turn_number", "tokens": "tokens", "transcriptions": "transcriptions", "pos_tags_prf": "pos_tags_prf", "pos_tags_ud": "pos_tags_ud"}, "dataset_description": "The Hong Kong Cantonese Corpus (HKCanCor) comprise transcribed conversations\nrecorded between March 1997 and August 1998. It contains recordings of\nspontaneous speech (51 texts) and radio programmes (42 texts),\nwhich involve 2 to 4 speakers, with 1 text of monologue.\n\nIn total, the corpus contains around 230,000 Chinese words.\nThe text is word-segmented, annotated with part-of-speech (POS) tags and\nromanised Cantonese pronunciation.\n\nRomanisation scheme - Linguistic Society of Hong Kong (LSHK)\nPOS scheme - Peita-Fujitsu-Renmin Ribao (PRF) corpus (Duan et al., 2000),\n with extended tags for Cantonese-specific phenomena added by\n Luke and Wang (see original paper for details).\n", "dataset_name": "hkcancor"}}, "tags": ["task_categories:translation", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:yue"], "is_gated": false}, "hlgd": {"dataset_name": "hlgd", "description": "HLGD is a binary classification dataset consisting of 20,056 labeled news headlines pairs indicating\nwhether the two headlines describe the same underlying world event or not.", "downloads": 365, "configs": {"default": {"config_name": "default", "sample_row": "{\"timeline_id\": \"9\", \"headline_a\": \"\\\"Seven bodies found after dam burst at Brazil mine...\", \"headline_b\": \"\\\"Fears rise for 300 missing in Brazil dam disaster...\", \"date_a\": \"\\\"2019-01-25\\\"\", \"date_b\": \"\\\"2019-01-26\\\"\", \"url_a\": \"\\\"https://www.reuters.com/article/us-brazil-vale-di...\", \"url_b\": \"\\\"https://timesofindia.indiatimes.com/world/rest-of...\", \"label\": \"0\"}", "columns": ["timeline_id", "headline_a", "headline_b", "date_a", "date_b", "url_a", "url_b", "label"], "columns_mapping": {"timeline_id": "timeline_id", "headline_a": "headline_a", "headline_b": "headline_b", "date_a": "date_a", "date_b": "date_b", "url_a": "url_a", "url_b": "url_b", "label": "label"}, "dataset_description": "HLGD is a binary classification dataset consisting of 20,056 labeled news headlines pairs indicating\nwhether the two headlines describe the same underlying world event or not.\n", "dataset_name": "hlgd"}}, "tags": ["task_categories:text-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "headline-grouping"], "is_gated": false}, "hover": {"dataset_name": "hover", "description": "HoVer is an open-domain, many-hop fact extraction and claim verification dataset built upon the Wikipedia corpus. The original 2-hop claims are adapted from question-answer pairs from HotpotQA. It is collected by a team of NLP researchers at UNC Chapel Hill and Verisk Analytics.", "downloads": 303, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"uid\": \"\\\"330ca632-e83f-4011-b11b-0d0158145036\\\"\", \"claim\": \"\\\"Skagen Painter Peder Severin Kr\\\\u00f8yer favored ...\", \"supporting_facts\": \"[{\\\"key\\\": \\\"Kristian Zahrtmann\\\", \\\"value\\\": 0}, {\\\"key\\\"...\", \"label\": \"1\", \"num_hops\": \"3\", \"hpqa_id\": \"\\\"5ab7a86d5542995dae37e986\\\"\"}", "columns": ["id", "uid", "claim", "supporting_facts", "label", "num_hops", "hpqa_id"], "columns_mapping": {"id": "id", "uid": "uid", "claim": "claim", "supporting_facts": "supporting_facts", "label": "label", "num_hops": "num_hops", "hpqa_id": "hpqa_id"}, "dataset_description": "HoVer is an open-domain, many-hop fact extraction and claim verification dataset built upon the Wikipedia corpus. The original 2-hop claims are adapted from question-answer pairs from HotpotQA. It is collected by a team of NLP researchers at UNC Chapel Hill and Verisk Analytics.\n", "dataset_name": "hover"}}, "tags": ["task_categories:text-retrieval", "task_ids:fact-checking-retrieval", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "hrenwac_para": {"dataset_name": "hrenwac_para", "description": "The hrenWaC corpus version 2.0 consists of parallel Croatian-English texts crawled from the .hr top-level domain for Croatia.\nThe corpus was built with Spidextor (https://github.com/abumatran/spidextor), a tool that glues together the output of SpiderLing used for crawling and Bitextor used for bitext extraction. The accuracy of the extracted bitext on the segment level is around 80% and on the word level around 84%.", "downloads": 288, "configs": {"hrenWaC": {"config_name": "hrenWaC", "sample_row": "{\"translation.en\": \"\\\"There is probably no person in the world that the...\", \"translation.hr\": \"\\\"Vjerojatno ne postoji osoba na svijetu koja na vi...\"}", "columns": ["translation_en", "translation_hr"], "columns_mapping": {"translation.en": "translation_en", "translation.hr": "translation_hr"}, "dataset_description": "\nThe hrenWaC corpus version 2.0 consists of parallel Croatian-English texts crawled from the .hr top-level domain for Croatia.\nThe corpus was built with Spidextor (https://github.com/abumatran/spidextor), a tool that glues together the output of SpiderLing used for crawling and Bitextor used for bitext extraction. The accuracy of the extracted bitext on the segment level is around 80% and on the word level around 84%.\n", "dataset_name": "hrenwac_para"}}, "tags": ["task_categories:translation", "annotations_creators:no-annotation", "multilinguality:translation", "source_datasets:original", "language:en", "language:hr"], "is_gated": false}, "humicroedit": {"dataset_name": "humicroedit", "description": "This new dataset is designed to assess the funniness of edited news headlines.", "downloads": 595, "configs": {"subtask-1": {"config_name": "subtask-1", "sample_row": "{\"id\": \"\\\"14530\\\"\", \"original\": \"\\\"France is \\\\u2018 hunting down its citizens who jo...\", \"edit\": \"\\\"twins\\\"\", \"grades\": \"\\\"10000\\\"\", \"meanGrade\": \"0.2\"}", "columns": ["id", "original", "edit", "grades", "meanGrade"], "columns_mapping": {"id": "id", "original": "original", "edit": "edit", "grades": "grades", "meanGrade": "meanGrade"}, "dataset_description": "This new dataset is designed to assess the funniness of edited news headlines.\n", "dataset_name": "humicroedit"}, "subtask-2": {"config_name": "subtask-2", "sample_row": "{\"id\": \"\\\"10920-9866\\\"\", \"original1\": \"\\\"\\\\\\\" Gene Cernan , Last on the Moon , ...\", \"edit1\": \"\\\"Dancer\\\"\", \"grades1\": \"\\\"01113\\\"\", \"meanGrade1\": \"1.2\", \"original2\": \"\\\"\\\\\\\" Gene Cernan , Last Astronaut on the Moon , \\\"\", \"paraphrased_question\": \"\\\"What is Delta Air Line's periodical literature mo...\"}", "columns": ["NNQT_question", "uid", "subgraph", "template_index", "question", "sparql_wikidata", "sparql_dbpedia18", "template", "paraphrased_question"], "columns_mapping": {"NNQT_question": "NNQT_question", "uid": "uid", "subgraph": "subgraph", "template_index": "template_index", "question": "question", "sparql_wikidata": "sparql_wikidata", "sparql_dbpedia18": "sparql_dbpedia18", "template": "template", "paraphrased_question": "paraphrased_question"}, "dataset_description": "LC-QuAD 2.0 is a Large Question Answering dataset with 30,000 pairs of question and its corresponding SPARQL query. The target knowledge base is Wikidata and DBpedia, specifically the 2018 version. Please see our paper for details about the dataset creation process and framework.\n", "dataset_name": "lc_quad"}}, "tags": ["task_categories:question-answering", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "knowledge-base-qa"], "is_gated": false}, "lener_br": {"dataset_name": "lener_br", "description": "LeNER-Br is a Portuguese language dataset for named entity recognition\napplied to legal documents. LeNER-Br consists entirely of manually annotated\nlegislation and legal cases texts and contains tags for persons, locations,\ntime entities, organizations, legislation and legal cases.\nTo compose the dataset, 66 legal documents from several Brazilian Courts were\ncollected. Courts of superior and state levels were considered, such as Supremo\nTribunal Federal, Superior Tribunal de Justi\u00e7a, Tribunal de Justi\u00e7a de Minas\nGerais and Tribunal de Contas da Uni\u00e3o. In addition, four legislation documents\nwere collected, such as \"Lei Maria da Penha\", giving a total of 70 documents", "downloads": 383, "configs": {"lener_br": {"config_name": "lener_br", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"EMENTA\\\", \\\":\\\", \\\"APELA\\\\u00c7\\\\u00c3O\\\", \\\"C\\\\u00cdVEL\\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nLeNER-Br is a Portuguese language dataset for named entity recognition\napplied to legal documents. LeNER-Br consists entirely of manually annotated\nlegislation and legal cases texts and contains tags for persons, locations,\ntime entities, organizations, legislation and legal cases.\nTo compose the dataset, 66 legal documents from several Brazilian Courts were\ncollected. Courts of superior and state levels were considered, such as Supremo\nTribunal Federal, Superior Tribunal de Justi\u00e7a, Tribunal de Justi\u00e7a de Minas\nGerais and Tribunal de Contas da Uni\u00e3o. In addition, four legislation documents\nwere collected, such as \"Lei Maria da Penha\", giving a total of 70 documents\n", "dataset_name": "lener_br"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt", "legal"], "is_gated": false}, "liar": {"dataset_name": "liar", "description": "LIAR is a dataset for fake news detection with 12.8K human labeled short statements from politifact.com's API, and each statement is evaluated by a politifact.com editor for its truthfulness. The distribution of labels in the LIAR dataset is relatively well-balanced: except for 1,050 pants-fire cases, the instances for all other labels range from 2,063 to 2,638. In each case, the labeler provides a lengthy analysis report to ground each judgment.", "downloads": 1140, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"2635.json\\\"\", \"label\": \"0\", \"statement\": \"\\\"Says the Annies List political group supports thi...\", \"subject\": \"\\\"abortion\\\"\", \"speaker\": \"\\\"dwayne-bohac\\\"\", \"job_title\": \"\\\"State representative\\\"\", \"state_info\": \"\\\"Texas\\\"\", \"party_affiliation\": \"\\\"republican\\\"\", \"barely_true_counts\": \"0.0\", \"false_counts\": \"1.0\", \"half_true_counts\": \"0.0\", \"mostly_true_counts\": \"0.0\", \"pants_on_fire_counts\": \"0.0\", \"context\": \"\\\"a mailer\\\"\"}", "columns": ["id", "label", "statement", "subject", "speaker", "job_title", "state_info", "party_affiliation", "barely_true_counts", "false_counts", "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"], "columns_mapping": {"id": "id", "label": "label", "statement": "statement", "subject": "subject", "speaker": "speaker", "job_title": "job_title", "state_info": "state_info", "party_affiliation": "party_affiliation", "barely_true_counts": "barely_true_counts", "false_counts": "false_counts", "half_true_counts": "half_true_counts", "mostly_true_counts": "mostly_true_counts", "pants_on_fire_counts": "pants_on_fire_counts", "context": "context"}, "dataset_description": "LIAR is a dataset for fake news detection with 12.8K human labeled short statements from politifact.com's API, and each statement is evaluated by a politifact.com editor for its truthfulness. The distribution of labels in the LIAR dataset is relatively well-balanced: except for 1,050 pants-fire cases, the instances for all other labels range from 2,063 to 2,638. In each case, the labeler provides a lengthy analysis report to ground each judgment.\n", "dataset_name": "liar"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "fake-news-detection"], "is_gated": false}, "librispeech_lm": {"dataset_name": "librispeech_lm", "description": "Language modeling resources to be used in conjunction with the LibriSpeech ASR corpus.", "downloads": 306, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"A\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "Language modeling resources to be used in conjunction with the LibriSpeech ASR corpus.\n", "dataset_name": "librispeech_lm"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "limit": {"dataset_name": "limit", "description": "Motion recognition is one of the basic cognitive capabilities of many life forms, yet identifying motion of physical entities in natural language have not been explored extensively and empirically. Literal-Motion-in-Text (LiMiT) dataset, is a large human-annotated collection of English text sentences describing physical occurrence of motion, with annotated physical entities in motion.", "downloads": 392, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"sentence\": \"\\\" A little boy holding a yellow ball walks by.\\\"\", \"motion\": \"\\\"yes\\\"\", \"motion_entities\": \"[{\\\"entity\\\": \\\"little boy\\\", \\\"start_index\\\": 2}, {\\\"ent...\"}", "columns": ["id", "sentence", "motion", "motion_entities"], "columns_mapping": {"id": "id", "sentence": "sentence", "motion": "motion", "motion_entities": "motion_entities"}, "dataset_description": "Motion recognition is one of the basic cognitive capabilities of many life forms, yet identifying motion of physical entities in natural language have not been explored extensively and empirically. Literal-Motion-in-Text (LiMiT) dataset, is a large human-annotated collection of English text sentences describing physical occurrence of motion, with annotated physical entities in motion.\n", "dataset_name": "limit"}}, "tags": ["task_categories:token-classification", "task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:named-entity-recognition", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|net-activities-captions", "source_datasets:original", "language:en"], "is_gated": false}, "linnaeus": {"dataset_name": "linnaeus", "description": "A novel corpus of full-text documents manually annotated for species mentions.", "downloads": 305, "configs": {"linnaeus": {"config_name": "linnaeus", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Scp160p\\\", \\\",\\\", \\\"a\\\", \\\"multiple\\\", \\\"KH\\\", \\\"-\\\", \\\"doma...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "A novel corpus of full-text documents manually annotated for species mentions.\n", "dataset_name": "linnaeus"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "lm1b": {"dataset_name": "lm1b", "description": "A benchmark corpus to be used for measuring progress in statistical language modeling. This has almost one billion words in the training data.", "downloads": 1029, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\"While athletes in different professions dealt wit...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "A benchmark corpus to be used for measuring progress in statistical language modeling. This has almost one billion words in the training data.\n", "dataset_name": "lm1b"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "language:en"], "is_gated": false}, "mac_morpho": {"dataset_name": "mac_morpho", "description": "Mac-Morpho is a corpus of Brazilian Portuguese texts annotated with part-of-speech tags.\nIts first version was released in 2003 [1], and since then, two revisions have been made in order\nto improve the quality of the resource [2, 3].\nThe corpus is available for download split into train, development and test sections.\nThese are 76%, 4% and 20% of the corpus total, respectively (the reason for the unusual numbers\nis that the corpus was first split into 80%/20% train/test, and then 5% of the train section was\nset aside for development). This split was used in [3], and new POS tagging research with Mac-Morpho\nis encouraged to follow it in order to make consistent comparisons possible.\n\n\n[1] Alu\u00edsio, S., Pelizzoni, J., Marchi, A.R., de Oliveira, L., Manenti, R., Marquiaf\u00e1vel, V. 2003.\nAn account of the challenge of tagging a reference corpus for brazilian portuguese.\nIn: Proceedings of the 6th International Conference on Computational Processing of the Portuguese Language. PROPOR 2003\n\n[2] Fonseca, E.R., Rosa, J.L.G. 2013. Mac-morpho revisited: Towards robust part-of-speech.\nIn: Proceedings of the 9th Brazilian Symposium in Information and Human Language Technology \u2013 STIL\n\n[3] Fonseca, E.R., Alu\u00edsio, Sandra Maria, Rosa, J.L.G. 2015.\nEvaluating word embeddings and a revised corpus for part-of-speech tagging in Portuguese.\nJournal of the Brazilian Computer Society.", "downloads": 286, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Jersei\\\", \\\"atinge\\\", \\\"m\\\\u00e9dia\\\", \\\"de\\\", \\\"Cr$\\\", \\\"1...\", \"pos_tags\": \"[14, 19, 14, 15, 22, 7, 14, 9, 14, 9, 3, 15, 3, 3,...\"}", "columns": ["id", "tokens", "pos_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags"}, "dataset_description": "\nMac-Morpho is a corpus of Brazilian Portuguese texts annotated with part-of-speech tags.\nIts first version was released in 2003 [1], and since then, two revisions have been made in order\nto improve the quality of the resource [2, 3].\nThe corpus is available for download split into train, development and test sections.\nThese are 76%, 4% and 20% of the corpus total, respectively (the reason for the unusual numbers\nis that the corpus was first split into 80%/20% train/test, and then 5% of the train section was\nset aside for development). This split was used in [3], and new POS tagging research with Mac-Morpho\nis encouraged to follow it in order to make consistent comparisons possible.\n\n\n[1] Alu\u00edsio, S., Pelizzoni, J., Marchi, A.R., de Oliveira, L., Manenti, R., Marquiaf\u00e1vel, V. 2003.\nAn account of the challenge of tagging a reference corpus for brazilian portuguese.\nIn: Proceedings of the 6th International Conference on Computational Processing of the Portuguese Language. PROPOR 2003\n\n[2] Fonseca, E.R., Rosa, J.L.G. 2013. Mac-morpho revisited: Towards robust part-of-speech.\nIn: Proceedings of the 9th Brazilian Symposium in Information and Human Language Technology \u2013 STIL\n\n[3] Fonseca, E.R., Alu\u00edsio, Sandra Maria, Rosa, J.L.G. 2015.\nEvaluating word embeddings and a revised corpus for part-of-speech tagging in Portuguese.\nJournal of the Brazilian Computer Society.\n", "dataset_name": "mac_morpho"}}, "tags": ["task_categories:token-classification", "task_ids:part-of-speech", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt"], "is_gated": false}, "masakhaner": {"dataset_name": "masakhaner", "description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811", "downloads": 2048, "configs": {"amh": {"config_name": "amh", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u1240\\\\u12f3\\\\u121a\\\\u12cd\\\", \\\"\\\\u12e8\\\\u1236\\\\u121b\\\\u...\", \"ner_tags\": \"[0, 5, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "hau": {"config_name": "hau", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"A\\\", \\\"saurari\\\", \\\"cikakken\\\", \\\"rahoton\\\", \\\"wakilin\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 3, 4, 1, 2]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "ibo": {"config_name": "ibo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ike\\\", \\\"\\\\u1ecbda\\\", \\\"j\\\\u1ee5\\\\u1ee5\\\", \\\"ot\\\\u1ee5\\\", \\\"...\", \"ner_tags\": \"[0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "kin": {"config_name": "kin", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ambasaderi\\\", \\\"wa\\\", \\\"EU\\\", \\\"mu\\\", \\\"Rwanda\\\", \\\",\\\", \\\"N...\", \"ner_tags\": \"[0, 0, 3, 0, 5, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "lug": {"config_name": "lug", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Empaka\\\", \\\"zaakubeera\\\", \\\"mu\\\", \\\"kibuga\\\", \\\"Liverpoo...\", \"ner_tags\": \"[0, 0, 0, 0, 5, 0, 5, 0, 0, 0, 7, 8, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "luo": {"config_name": "luo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\ufeffJii\\\", \\\"2\\\", \\\"moko\\\", \\\"jowito\\\", \\\"ngimagi\\\", \\\"k...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "pcm": {"config_name": "pcm", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Mixed\\\", \\\"Martial\\\", \\\"Arts\\\", \\\"joinbodi\\\", \\\",\\\", \\\"Ult...\", \"ner_tags\": \"[3, 4, 4, 0, 0, 3, 4, 4, 0, 3, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "swa": {"config_name": "swa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Wizara\\\", \\\"ya\\\", \\\"afya\\\", \\\"ya\\\", \\\"Tanzania\\\", \\\"imerip...\", \"ner_tags\": \"[3, 4, 4, 4, 4, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "wol": {"config_name": "wol", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"SAFIYETU\\\", \\\"B\\\\u00c9EY\\\", \\\"C\\\\u00e9y\\\", \\\"Koronaa\\\", \\\"...\", \"ner_tags\": \"[1, 2, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}, "yor": {"config_name": "yor", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"K\\\\u00f2\\\", \\\"s\\\\u00ed\\\", \\\"\\\\u1eb9\\\\u0300r\\\\u00ed\\\", \\\"t\\\\u...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER is the first large publicly available high-quality dataset for named entity recognition (NER) in ten African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for ten African languages:\n- Amharic\n- Hausa\n- Igbo\n- Kinyarwanda\n- Luganda\n- Luo\n- Nigerian-Pidgin\n- Swahili\n- Wolof\n- Yoruba\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhaner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:am", "language:ha", "language:ig", "language:lg", "language:luo", "language:pcm", "language:rw", "language:sw", "language:wo", "language:yo"], "is_gated": false}, "math_qa": {"dataset_name": "math_qa", "description": "Our dataset is gathered by using a new representation language to annotate over the AQuA-RAT dataset. AQuA-RAT has provided the questions, options, rationale, and the correct options.", "downloads": 31555, "configs": {"default": {"config_name": "default", "sample_row": "{\"Problem\": \"\\\"the banker ' s gain of a certain sum due 3 years ...\", \"Rationale\": \"\\\"\\\\\\\"explanation : t = 3 years r = 10 % td = ( bg \\\\u...\", \"options\": \"\\\"a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d ) ...\", \"correct\": \"\\\"a\\\"\", \"annotated_formula\": \"\\\"divide(multiply(const_100, divide(multiply(36, co...\", \"linear_formula\": \"\\\"multiply(n2,const_100)|multiply(n0,n1)|divide(#0,...\", \"category\": \"\\\"gain\\\"\"}", "columns": ["Problem", "Rationale", "options", "correct", "annotated_formula", "linear_formula", "category"], "columns_mapping": {"Problem": "Problem", "Rationale": "Rationale", "options": "options", "correct": "correct", "annotated_formula": "annotated_formula", "linear_formula": "linear_formula", "category": "category"}, "dataset_description": "\nOur dataset is gathered by using a new representation language to annotate over the AQuA-RAT dataset. AQuA-RAT has provided the questions, options, rationale, and the correct options.\n", "dataset_name": "math_qa"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|aqua_rat", "language:en"], "is_gated": false}, "mbpp": {"dataset_name": "mbpp", "description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been\nhand-verified by the authors.", "downloads": 12392, "configs": {"full": {"config_name": "full", "sample_row": "{\"task_id\": \"601\", \"text\": \"\\\"Write a function to find the longest chain which ...\", \"code\": \"\\\"class Pair(object): \\\\r\\\\n\\\\tdef __init__(self, a, b...\", \"test_list\": \"[\\\"assert max_chain_length([Pair(5, 24), Pair(15, 2...\", \"test_setup_code\": \"\\\"\\\"\", \"challenge_test_list\": \"[]\"}", "columns": ["task_id", "text", "code", "test_list", "test_setup_code", "challenge_test_list"], "columns_mapping": {"task_id": "task_id", "text": "text", "code": "code", "test_list": "test_list", "test_setup_code": "test_setup_code", "challenge_test_list": "challenge_test_list"}, "dataset_description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been\nhand-verified by the authors.\n", "dataset_name": "mbpp"}, "sanitized": {"config_name": "sanitized", "sample_row": "{\"source_file\": \"\\\"Benchmark Questions Verification V2.ipynb\\\"\", \"task_id\": \"602\", \"prompt\": \"\\\"Write a python function to find the first repeate...\", \"code\": \"\\\"def first_repeated_char(str1):\\\\n for index,c in ...\", \"test_imports\": \"[]\", \"test_list\": \"[\\\"assert first_repeated_char(\\\\\\\"abcabc\\\\\\\") == \\\\\\\"a\\\\\\\"\\\"...\"}", "columns": ["source_file", "task_id", "prompt", "code", "test_imports", "test_list"], "columns_mapping": {"source_file": "source_file", "task_id": "task_id", "prompt": "prompt", "code": "code", "test_imports": "test_imports", "test_list": "test_list"}, "dataset_description": "The MBPP (Mostly Basic Python Problems) dataset consists of around 1,000 crowd-sourced Python\nprogramming problems, designed to be solvable by entry level programmers, covering programming\nfundamentals, standard library functionality, and so on. Each problem consists of a task\ndescription, code solution and 3 automated test cases. The sanitized subset of the data has been\nhand-verified by the authors.\n", "dataset_name": "mbpp"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "code-generation"], "is_gated": false}, "mc4": {"dataset_name": "mc4", "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.", "downloads": 20390, "configs": {"af": {"config_name": "af", "sample_row": "{\"text\": \"\\\"Toe was daar nie plek vir telling teen Ikeys | Ne...\", \"timestamp\": \"\\\"2018-11-19T07:24:51Z\\\"\", \"url\": \"\\\"https://www.netwerk24.com/Sport/Rugby/toe-was-daa...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "am": {"config_name": "am", "sample_row": "{\"text\": \"\\\"\\\\u1260\\\\u1309\\\\u122d\\\\u121d\\\\u1235\\\\u1293 \\\\u12d5\\\\u12f5...\", \"timestamp\": \"\\\"2019-06-20T13:32:25Z\\\"\", \"url\": \"\\\"https://malvorlagen-seite.de/am/pubertaet-bei-jug...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ar": {"config_name": "ar", "sample_row": "{\"text\": \"\\\"\\\\\\\"\\\\u062e\\\\u0644\\\\u064a \\\\u0648\\\\u0631\\\\u0642\\\\u062a\\\\u06...\", \"timestamp\": \"\\\"2018-11-14T08:51:59Z\\\"\", \"url\": \"\\\"http://www.ghadinews.net/newsdet.aspx?id=8909&id2...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "az": {"config_name": "az", "sample_row": "{\"text\": \"\\\"M\\\\u00fchacir\\\\u0259t m\\\\u00f6vzusunun \\\\u00f6yr\\\\u025...\", \"timestamp\": \"\\\"2019-01-23T08:22:09Z\\\"\", \"url\": \"\\\"https://azertag.az/xeber/Muhaciret_movzusunun_oyr...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "be": {"config_name": "be", "sample_row": "{\"text\": \"\\\"\\\\u0410\\\\u0434\\\\u0437\\\\u0456\\\\u043d \\\\u043c\\\\u0430\\\\u043b...\", \"timestamp\": \"\\\"2019-02-20T00:21:49Z\\\"\", \"url\": \"\\\"http://uzv.by/adzin-malenki-uspamin-z-dzyacinstva...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "bg": {"config_name": "bg", "sample_row": "{\"text\": \"\\\"\\\\u0410\\\\u043c\\\\u0435\\\\u0440\\\\u0438\\\\u043a\\\\u0430\\\\u043d\\\\...\", \"timestamp\": \"\\\"2020-05-31T09:29:33Z\\\"\", \"url\": \"\\\"http://www.spacenewsbg.com/news/29/April/2020/537...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "bg-Latn": {"config_name": "bg-Latn", "sample_row": "{\"text\": \"\\\"Prezzi e Quotazioni Aggiornate 2017 Opel Astra-5-...\", \"timestamp\": \"\\\"2017-10-17T20:44:56Z\\\"\", \"url\": \"\\\"http://listino.infomotori.com/quotazione_usato/op...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "bn": {"config_name": "bn", "sample_row": "{\"text\": \"\\\"\\\\u09a6\\\\u09cd\\\\u09ac\\\\u09bf\\\\u09a4\\\\u09c0\\\\u09df \\\\u09ae...\", \"timestamp\": \"\\\"2019-09-19T00:07:47Z\\\"\", \"url\": \"\\\"http://dailysylhet.com/details/419696\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ca": {"config_name": "ca", "sample_row": "{\"text\": \"\\\"Les croades by Oriol_ins_front_m... 999 views\\\\nVi...\", \"timestamp\": \"\\\"2019-07-20T05:40:39Z\\\"\", \"url\": \"\\\"https://www.slideshare.net/quarteso/el-reino-delo...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ceb": {"config_name": "ceb", "sample_row": "{\"text\": \"\\\"Khet Rat Burana - Wikipedia\\\\nTiganos: 13\\\\u00b040\\\\...\", \"timestamp\": \"\\\"2020-08-07T05:31:55Z\\\"\", \"url\": \"\\\"https://ceb.wikipedia.org/wiki/Khet_Rat_Burana\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "co": {"config_name": "co", "sample_row": "{\"text\": \"\\\"Prima pagina FEMINA CLUB Vin rosu cu banane 27 Ju...\", \"timestamp\": \"\\\"2017-07-27T14:49:10Z\\\"\", \"url\": \"\\\"http://www.revistamagazin.ro/content/view/5247/5/...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "cs": {"config_name": "cs", "sample_row": "{\"text\": \"\\\"Kempy & Soust\\\\u0159ed\\\\u011bn\\\\u00ed Aerobik klub O...\", \"timestamp\": \"\\\"2020-01-29T18:58:35Z\\\"\", \"url\": \"\\\"http://www.aerobikolomouc.cz/ak-olomouc/kempy-201...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "cy": {"config_name": "cy", "sample_row": "{\"text\": \"\\\"Red River: \\\\u039f\\\\u03c1\\\\u03b9\\\\u03c3\\\\u03bc\\\\u03ad\\\\u...\", \"timestamp\": \"\\\"2018-03-22T15:56:46Z\\\"\", \"url\": \"\\\"http://followtheredriver.blogspot.com/2012/06/blo...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "da": {"config_name": "da", "sample_row": "{\"text\": \"\\\"Om\\\\u00f8 - Wikipedia's Om\\\\u00f8 as translated by ...\", \"timestamp\": \"\\\"2020-08-11T10:15:26Z\\\"\", \"url\": \"\\\"https://dan.wikitrans.net/Om%C3%B8\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "de": {"config_name": "de", "sample_row": "{\"text\": \"\\\"Home - Homepage des Kunstvereins Pro Ars Lausitz ...\", \"timestamp\": \"\\\"2018-01-20T18:56:35Z\\\"\", \"url\": \"\\\"http://proarslausitz.de/1.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "el": {"config_name": "el", "sample_row": "{\"text\": \"\\\"\\\\u03a4\\\\u03b1 \\\\u03ba\\\\u03b1\\\\u03bb\\\\u03cd\\\\u03c4\\\\u03b5...\", \"timestamp\": \"\\\"2017-07-21T19:11:04Z\\\"\", \"url\": \"\\\"https://www.tripadvisor.com.gr/Restaurants-g61240...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "el-Latn": {"config_name": "el-Latn", "sample_row": "{\"text\": \"\\\"Art.No.: VB-200108-10-H\\\\nUrsula writes: 06.08.201...\", \"timestamp\": \"\\\"2019-08-19T12:28:32Z\\\"\", \"url\": \"\\\"https://www.vivobarefoot.de/en/ladies/vivobarefoo...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "en": {"config_name": "en", "sample_row": "{\"text\": \"\\\"Posts 4,362\\\\tMore Info\\\\nOkay so to those of you t...\", \"timestamp\": \"\\\"2014-03-09T04:06:28Z\\\"\", \"url\": \"\\\"http://www.polkaudio.com/forums/showthread.php?58...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "eo": {"config_name": "eo", "sample_row": "{\"text\": \"\\\"Oberiu - Wikipedia's Oberiu as translated by Gram...\", \"timestamp\": \"\\\"2019-06-16T09:35:01Z\\\"\", \"url\": \"\\\"https://epo.wikitrans.net/Oberiu\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "es": {"config_name": "es", "sample_row": "{\"text\": \"\\\"Comprar Zapatillas para ni\\\\u00f1a en chancla con ...\", \"timestamp\": \"\\\"2019-01-18T17:11:30Z\\\"\", \"url\": \"\\\"https://www.calzadoslabalear.com/es/zapatillas-mu...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "et": {"config_name": "et", "sample_row": "{\"text\": \"\\\"EUROPAELi avatud andmete portaal Andmed Andmete a...\", \"timestamp\": \"\\\"2018-03-23T04:56:35Z\\\"\", \"url\": \"\\\"http://data.europa.eu/euodp/et/data/dataset/secto...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "eu": {"config_name": "eu", "sample_row": "{\"text\": \"\\\"Liverpool: The Beatles-en jaioterria eta ametsen ...\", \"timestamp\": \"\\\"2018-07-19T13:36:42Z\\\"\", \"url\": \"\\\"http://www.durangojesuitak.org/liverpool-the-beat...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "fa": {"config_name": "fa", "sample_row": "{\"text\": \"\\\"\\\\u0642\\\\u06cc\\\\u0645\\\\u062a \\\\u062f\\\\u0648\\\\u0631\\\\u0628...\", \"timestamp\": \"\\\"2018-10-23T17:29:51Z\\\"\", \"url\": \"\\\"http://sib7.com/%D9%85%D8%B1%D8%A7%D9%82%D8%A8%D8...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "fi": {"config_name": "fi", "sample_row": "{\"text\": \"\\\"Kontiolahti - Ihmisen pelastaminen - Kelkkailijat...\", \"timestamp\": \"\\\"2020-02-28T14:21:24Z\\\"\", \"url\": \"\\\"https://www.pkpelastuslaitos.fi/onnettomuustiedot...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "fil": {"config_name": "fil", "sample_row": "{\"text\": \"\\\"\\\\ud83d\\\\ude00 Halimbawa ng thesis sa filipino tung...\", \"timestamp\": \"\\\"2019-04-19T17:14:36Z\\\"\", \"url\": \"\\\"http://talisman-intl.com/halimbawa-ng-thesis-sa-f...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "fr": {"config_name": "fr", "sample_row": "{\"text\": \"\\\"Le sacre de philippe ier, 23 mai 1059 - Compte Re...\", \"timestamp\": \"\\\"2017-12-15T04:37:34Z\\\"\", \"url\": \"\\\"http://www.etudier.com/dissertations/Le-Sacre-De-...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "fy": {"config_name": "fy", "sample_row": "{\"text\": \"\\\"Business Park Mas Blau II Place Pla de L\\\\u2019Est...\", \"timestamp\": \"\\\"2019-07-20T12:10:15Z\\\"\", \"url\": \"\\\"https://1worldirectory.com/28th-euro-global-neuro...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ga": {"config_name": "ga", "sample_row": "{\"text\": \"\\\"Smaointe F\\\\u00e1nacha Aonghusa: Comhl\\\\u00e1n\\\\u00f...\", \"timestamp\": \"\\\"2018-01-18T18:03:55Z\\\"\", \"url\": \"\\\"https://aonghus.blogspot.com/2014/08/comhlanu-le-...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "gd": {"config_name": "gd", "sample_row": "{\"text\": \"\\\"Caol Reatha - Uicipeid\\\\nCo-chomharran: 57\\\\u00b013...\", \"timestamp\": \"\\\"2020-08-03T21:31:47Z\\\"\", \"url\": \"\\\"https://gd.m.wikipedia.org/wiki/Caol_Reatha\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "gl": {"config_name": "gl", "sample_row": "{\"text\": \"\\\"Niza (San Sebasti\\\\u00e1n - Donostia, Espa\\\\u00f1a)...\", \"timestamp\": \"\\\"2017-09-25T14:11:41Z\\\"\", \"url\": \"\\\"https://www.tripadvisor.es/VacationRentalReview-g...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "gu": {"config_name": "gu", "sample_row": "{\"text\": \"\\\"\\\\u0aee\\\\u0ae6% \\\\u0aa8\\\\u0abf\\\\u0a95\\\\u0abe\\\\u0ab8\\\\u0a9...\", \"timestamp\": \"\\\"2018-12-10T08:21:40Z\\\"\", \"url\": \"\\\"http://sandesh.com/80-exporters-gst-refund-7-m/\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ha": {"config_name": "ha", "sample_row": "{\"text\": \"\\\"Ma\\\\u0257aukaki Matsala, Magana, Magani bayyana: M...\", \"timestamp\": \"\\\"2019-07-22T01:13:39Z\\\"\", \"url\": \"\\\"https://www.martinvrijland.nl/ha/nazarin-labarai/...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "haw": {"config_name": "haw", "sample_row": "{\"text\": \"\\\"houses Kampala - Fashion - Fashion Accessories - ...\", \"timestamp\": \"\\\"2018-06-23T23:17:39Z\\\"\", \"url\": \"\\\"https://www.afribaba.ug/ads/houses+Kampala.htm?ci...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "hi": {"config_name": "hi", "sample_row": "{\"text\": \"\\\"6 \\\\u0938\\\\u093e\\\\u0932 \\\\u0915\\\\u0940 \\\\u092c\\\\u091a\\\\u0...\", \"timestamp\": \"\\\"2018-12-15T16:31:15Z\\\"\", \"url\": \"\\\"http://www.upuklive.com/2018/11/6_20.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "hi-Latn": {"config_name": "hi-Latn", "sample_row": "{\"text\": \"\\\"Total de visitas: 24089\\\\nHindi Book Free Download...\", \"timestamp\": \"\\\"2019-08-26T01:02:16Z\\\"\", \"url\": \"\\\"http://thylmotopbtovict.comunidades.net/hindi-boo...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "hmn": {"config_name": "hmn", "sample_row": "{\"text\": \"\\\"Yuav Ua Li Cas Cov Nkag Tawm Ntawm Cov Pob Taws U...\", \"timestamp\": \"\\\"2020-04-02T03:42:06Z\\\"\", \"url\": \"\\\"https://hmn.phanthanhgianfoundation.com/how-to-re...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ht": {"config_name": "ht", "sample_row": "{\"text\": \"\\\"Gwo pouvwa 40W ki ap dirije lari lanp segond\\\\u00e...\", \"timestamp\": \"\\\"2020-06-06T17:00:07Z\\\"\", \"url\": \"\\\"https://www.ledlightinside.com/ht/sword-series-le...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "hu": {"config_name": "hu", "sample_row": "{\"text\": \"\\\"Gy\\\\u00e1ri 5X112 7X17 ET54 57.1 HA1618 SEAT Gy\\\\u0...\", \"timestamp\": \"\\\"2020-07-06T09:13:31Z\\\"\", \"url\": \"\\\"https://weltgumi.hu/termek/imp_13_ha1618.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "hy": {"config_name": "hy", "sample_row": "{\"text\": \"\\\"\\\\u0556\\\\u0580\\\\u0561\\\\u0576\\\\u057d\\\\u056b\\\\u0561\\\\u0575\\\\...\", \"timestamp\": \"\\\"2020-06-06T20:17:50Z\\\"\", \"url\": \"\\\"https://www.1in.am/2729354.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "id": {"config_name": "id", "sample_row": "{\"text\": \"\\\"thapki full serial | Cinta Sinopsis\\\\nHome \\\\u00bb ...\", \"timestamp\": \"\\\"2017-12-11T17:34:24Z\\\"\", \"url\": \"\\\"http://cintasinopsis2.com/search/thapki-full-seri...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ig": {"config_name": "ig", "sample_row": "{\"text\": \"\\\"Hoka One One Arahi 3 Women's Allure/Mood Indigo [...\", \"timestamp\": \"\\\"2020-07-02T18:27:05Z\\\"\", \"url\": \"\\\"https://www.hoka-shoes.com/hoka-one-one-arahi-3-w...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "is": {"config_name": "is", "sample_row": "{\"text\": \"\\\"Omegle Ifugao. Besta val Omegle Ifugao. Inn og ha...\", \"timestamp\": \"\\\"2017-07-22T16:44:11Z\\\"\", \"url\": \"\\\"http://is.theomegle.com/filippseyjar/ifugao\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "it": {"config_name": "it", "sample_row": "{\"text\": \"\\\"Porcate Da Fare Con Il Partner Video Flirt Online...\", \"timestamp\": \"\\\"2017-08-18T23:57:08Z\\\"\", \"url\": \"\\\"http://gerebe.eu/porcate-da-fare-con-il-partner-v...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "iw": {"config_name": "iw", "sample_row": "{\"text\": \"\\\"\\\\u05d6\\\\u05db\\\\u05d5\\\\u05ea \\\\u05d4\\\\u05e9\\\\u05d1\\\\u05d9...\", \"timestamp\": \"\\\"2019-03-18T19:44:40Z\\\"\", \"url\": \"\\\"https://www.yeshiva.org.il/midrash/14790\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ja": {"config_name": "ja", "sample_row": "{\"text\": \"\\\"\\\\u751f\\\\u516b\\\\u3064\\\\u6a4b\\\\u306e\\\\u30bf\\\\u30b0\\\\u307e\\\\...\", \"timestamp\": \"\\\"2020-05-27T07:31:25Z\\\"\", \"url\": \"\\\"https://www.exblog.jp/tag/keyword/%E7%94%9F%E5%85...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ja-Latn": {"config_name": "ja-Latn", "sample_row": "{\"text\": \"\\\"Yuria Ashina - Pics & Movies Galleries - Teenax\\\\n...\", \"timestamp\": \"\\\"2017-09-25T10:18:12Z\\\"\", \"url\": \"\\\"http://www.teenax.com/free/pics-movies/yuria/ashi...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "jv": {"config_name": "jv", "sample_row": "{\"text\": \"\\\"Parcel Baby Born | IklanBarisMassal.com | sebar i...\", \"timestamp\": \"\\\"2018-11-15T23:20:00Z\\\"\", \"url\": \"\\\"http://iklanbarismassal.iklanbaris.org/tag/parcel...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ka": {"config_name": "ka", "sample_row": "{\"text\": \"\\\"\\\\u10e0\\\\u10e3\\\\u10e1\\\\u10d4\\\\u10d7\\\\u10d8\\\\u10e1 \\\\u10de...\", \"timestamp\": \"\\\"2018-07-17T16:07:58Z\\\"\", \"url\": \"\\\"https://www.radiotavisupleba.ge/a/rusetis-presis-...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "kk": {"config_name": "kk", "sample_row": "{\"text\": \"\\\"\\\\u0422\\\\u0430\\\\u049b\\\\u044b\\\\u0440\\\\u044b\\\\u043f: \\\\u041...\", \"timestamp\": \"\\\"2017-11-21T08:13:58Z\\\"\", \"url\": \"\\\"http://www.tarbie.kz/1205\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "km": {"config_name": "km", "sample_row": "{\"text\": \"\\\"\\\\u1794\\\\u17d2\\\\u179b\\\\u17c2\\\\u1780\\\\u17d7 \\\\u17d6\\\\u1796...\", \"timestamp\": \"\\\"2019-02-16T12:30:53Z\\\"\", \"url\": \"\\\"http://youfeed.net/archives/27\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "kn": {"config_name": "kn", "sample_row": "{\"text\": \"\\\"\\\\u0cb8\\\\u0ca6\\\\u0ccd\\\\u0caf\\\\u0ca6\\\\u0cb2\\\\u0ccd\\\\u0cb2\\\\...\", \"timestamp\": \"\\\"2020-07-15T18:51:35Z\\\"\", \"url\": \"\\\"https://kannada.goodreturns.in/news/new-one-rupee...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ko": {"config_name": "ko", "sample_row": "{\"text\": \"\\\"\\\\uc6c0\\\\uc9e4 - 19 \\\\uc774\\\\uc0c1\\\\ub9cc | \\\\ub2e4\\\\uc6...\", \"timestamp\": \"\\\"2020-07-13T03:51:37Z\\\"\", \"url\": \"\\\"https://leesangman.com/tag/%EC%9B%80%EC%A7%A4/\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ku": {"config_name": "ku", "sample_row": "{\"text\": \"\\\"\\\\ufffd\\\\u06b3\\\\ufffd\\\\u0177\\\\ufffd\\\\ufffd\\\\ufffd\\\\ufffd\\\\...\", \"timestamp\": \"\\\"2016-10-27T19:37:29Z\\\"\", \"url\": \"\\\"http://dl.rakuten.co.jp/prod/800822116.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ky": {"config_name": "ky", "sample_row": "{\"text\": \"\\\"\\\\u042f\\\\u043b\\\\u0433\\\\u044b\\\\u0448\\\\u043b\\\\u0430\\\\u0440\\\\...\", \"timestamp\": \"\\\"2019-07-19T03:43:32Z\\\"\", \"url\": \"\\\"http://atnya-rt.ru/news/mgyiyat/ialgislarni-buldi...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "la": {"config_name": "la", "sample_row": "{\"text\": \"\\\"OUDDORP - Huisartsenpraktijk Kop van 't Eiland | ...\", \"timestamp\": \"\\\"2019-08-18T07:52:35Z\\\"\", \"url\": \"\\\"https://www.vanderschootarchitecten.nl/projecten/...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "lb": {"config_name": "lb", "sample_row": "{\"text\": \"\\\"Truck Driving Jobs At Decker Truck Line | TruckDr...\", \"timestamp\": \"\\\"2019-11-21T09:17:59Z\\\"\", \"url\": \"\\\"https://truckdriverjobsingreatfallsmt.com/truck-d...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "lo": {"config_name": "lo", "sample_row": "{\"text\": \"\\\"\\\\u0e99\\\\u0eb3\\\\u200b\\\\u0e9e\\\\u0ea3\\\\u0eb0\\\\u200b\\\\u0e81\\\\...\", \"timestamp\": \"\\\"2014-03-16T04:29:14Z\\\"\", \"url\": \"\\\"https://www.lds.org/general-conference/2013/04/th...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "lt": {"config_name": "lt", "sample_row": "{\"text\": \"\\\"VALERIJA VILKAUSKIEN\\\\u0116\\\\nGid\\\\u0117 VALERIJA VI...\", \"timestamp\": \"\\\"2017-06-29T12:32:45Z\\\"\", \"url\": \"\\\"http://turizmokatalogas.lt/gidas/vilkauskiene-val...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "lv": {"config_name": "lv", "sample_row": "{\"text\": \"\\\"R\\\\u012bg\\\\u0101 2004. gada 26. mart\\\\u0101\\\\napstipr...\", \"timestamp\": \"\\\"2019-12-08T00:39:13Z\\\"\", \"url\": \"\\\"https://likumi.lv/doc.php?id=86335\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mg": {"config_name": "mg", "sample_row": "{\"text\": \"\\\"Find the best CPA or Tax Accountant in Papaaloa, ...\", \"timestamp\": \"\\\"2016-12-09T19:15:55Z\\\"\", \"url\": \"\\\"http://www.taxbuzz.com/find-the-best-tax-accounta...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mi": {"config_name": "mi", "sample_row": "{\"text\": \"\\\"\\\\u0411\\\\u0435\\\\u0442\\\\u043e\\\\u043d\\\\u0438, \\\\u0441\\\\u044...\", \"timestamp\": \"\\\"2017-07-20T16:43:12Z\\\"\", \"url\": \"\\\"https://ukrreferat.com/chapters/avtoref/betoni-st...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mk": {"config_name": "mk", "sample_row": "{\"text\": \"\\\"\\\\u0422\\\\u0420\\\\u0413\\\\u041d\\\\u0410\\\\u0410 \\\\u0421\\\\u041e...\", \"timestamp\": \"\\\"2018-11-20T14:56:13Z\\\"\", \"url\": \"\\\"http://sport.com.mk/megjunaroden-fudbal/uefa-evro...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ml": {"config_name": "ml", "sample_row": "{\"text\": \"\\\"\\\\u0d12\\\\u0d30\\\\u0d41 Ketogenic \\\\u0d21\\\\u0d2f\\\\u0d31\\\\u...\", \"timestamp\": \"\\\"2020-08-15T04:29:47Z\\\"\", \"url\": \"\\\"https://ml.elpasobackclinic.com/what-is-a-ketogen...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mn": {"config_name": "mn", "sample_row": "{\"text\": \"\\\"\\\\u0421\\\\u043f\\\\u043e\\\\u0440\\\\u0442\\\\t25 \\\\u041d\\\\u043e\\\\u...\", \"timestamp\": \"\\\"2020-06-03T22:37:43Z\\\"\", \"url\": \"\\\"https://vtinform.com/news/145/150979/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mr": {"config_name": "mr", "sample_row": "{\"text\": \"\\\"uedbet\\\\u7b2c\\\\u4e94\\\\u5341\\\\u4e5d\\\\u7ae0 \\\\u5c01\\\\u4faf...\", \"timestamp\": \"\\\"2019-10-16T04:53:12Z\\\"\", \"url\": \"\\\"http://www.oybx.cn/ddk1597/1662712.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ms": {"config_name": "ms", "sample_row": "{\"text\": \"\\\"Suzana Mustafa: Bunga Ros Camellia\\\\nBunga Ros Cam...\", \"timestamp\": \"\\\"2018-09-19T21:23:16Z\\\"\", \"url\": \"\\\"http://diariann.blogspot.com/2012/05/bunga-ros-ca...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "mt": {"config_name": "mt", "sample_row": "{\"text\": \"\\\"Tastaturi \\\\u00een Iasi - OLX.ro\\\\nAnunturi Iasi - ...\", \"timestamp\": \"\\\"2017-12-16T15:20:24Z\\\"\", \"url\": \"\\\"https://www.olx.ro/iasi_39939/q-tastaturi/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "my": {"config_name": "my", "sample_row": "{\"text\": \"\\\"\\\\u1042\\\\u1040\\\\u1041\\\\u1040 \\\\u1001\\\\u102f\\\\u108f\\\\u103d...\", \"timestamp\": \"\\\"2018-08-19T11:14:40Z\\\"\", \"url\": \"\\\"http://thevoicemyanmar.com/about-us/18711-lkl\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ne": {"config_name": "ne", "sample_row": "{\"text\": \"\\\"\\\\u092a\\\\u094b\\\\u0930\\\\u094d\\\\u091a\\\\u0941\\\\u0917\\\\u0932\\\\...\", \"timestamp\": \"\\\"2019-04-22T00:04:13Z\\\"\", \"url\": \"\\\"http://vishwanews.com/Articles/view/4012\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "nl": {"config_name": "nl", "sample_row": "{\"text\": \"\\\"Vijf gouden tips voor succesvol zaken doen met Ja...\", \"timestamp\": \"\\\"2019-02-22T15:37:25Z\\\"\", \"url\": \"\\\"https://ondernemingen.bnpparibasfortis.be/nl/arti...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "no": {"config_name": "no", "sample_row": "{\"text\": \"\\\"Alf-tande petersen: - Jeg klarte ikke \\\\u00e5 beve...\", \"timestamp\": \"\\\"2020-06-03T22:36:11Z\\\"\", \"url\": \"\\\"https://www.seher.no/kjendis/jeg-klarte-ikke-a-be...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ny": {"config_name": "ny", "sample_row": "{\"text\": \"\\\"Date Latino Women In Ikawa, Shizuoka - Chat To La...\", \"timestamp\": \"\\\"2020-03-29T17:45:30Z\\\"\", \"url\": \"\\\"https://www.afroromance.com/members/Japan/Shizuok...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "pa": {"config_name": "pa", "sample_row": "{\"text\": \"\\\"\\\\u0a07\\\\u0a38\\\\u0a32\\\\u0a3e\\\\u0a2e\\\\u0a3e\\\\u0a2c\\\\u0a3e\\\\...\", \"timestamp\": \"\\\"2020-08-06T13:06:11Z\\\"\", \"url\": \"\\\"https://jagbani.punjabkesari.in/international/new...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "pl": {"config_name": "pl", "sample_row": "{\"text\": \"\\\"Author: Dorothy Celeste\\\\nISBN: 779-8-61280-301-9\\\\...\", \"timestamp\": \"\\\"2018-06-20T22:33:57Z\\\"\", \"url\": \"\\\"http://downloadallstuffs.club/best/gran-canaria-p...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ps": {"config_name": "ps", "sample_row": "{\"text\": \"\\\"\\\\u062f \\\\u0648\\\\u0644\\\\u0633\\\\u0645\\\\u0634\\\\u0631 \\\\u062...\", \"timestamp\": \"\\\"2018-06-25T12:00:13Z\\\"\", \"url\": \"\\\"https://kabull.com/%D8%AF-%D9%88%D9%84%D8%B3%D9%8...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "pt": {"config_name": "pt", "sample_row": "{\"text\": \"\\\"Nova atra\\\\u00e7\\\\u00e3o de corredeiras do Sea Worl...\", \"timestamp\": \"\\\"2017-12-12T02:42:12Z\\\"\", \"url\": \"\\\"http://malucasepiradas.com.br/orlando/infinityfal...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ro": {"config_name": "ro", "sample_row": "{\"text\": \"\\\"Download Darone Feat. Amanda Wilson - Believe In ...\", \"timestamp\": \"\\\"2017-01-22T18:13:35Z\\\"\", \"url\": \"\\\"http://www.muzicanet.net/descarca-romaneasca/Daro...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ru": {"config_name": "ru", "sample_row": "{\"text\": \"\\\"\\\\u2714\\\\ud83d\\\\udc4d\\\\ud83c\\\\udfff \\\\u041a\\\\u0443\\\\u043f...\", \"timestamp\": \"\\\"2017-09-23T16:16:52Z\\\"\", \"url\": \"\\\"https://needhack.ru/zakazat-organic-mask-v-novosi...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ru-Latn": {"config_name": "ru-Latn", "sample_row": "{\"text\": \"\\\"Page 1977 of 3320.\\\\n39521 of 66398. 51471-Issledo...\", \"timestamp\": \"\\\"2017-08-20T23:15:02Z\\\"\", \"url\": \"\\\"http://writer5.ru/prompter/page1976.php\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sd": {"config_name": "sd", "sample_row": "{\"text\": \"\\\"\\\\u062f\\\\u0645\\\\u0627\\\\u0633\\\\u0646\\\\u062c \\\\u0648 \\\\u063...\", \"timestamp\": \"\\\"2019-09-20T12:23:42Z\\\"\", \"url\": \"\\\"https://rabinseh.com/%D8%AF%D9%85%D8%A7%D8%B3%D9%...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "si": {"config_name": "si", "sample_row": "{\"text\": \"\\\"\\\\u0d85\\\\u0db1\\\\u0dd4\\\\u0dc4\\\\u0dc3\\\\u0dca \\\\u0dbd\\\\u0db6...\", \"timestamp\": \"\\\"2018-06-22T17:19:49Z\\\"\", \"url\": \"\\\"http://tharunie.lk/component/k2/item/2907-%E0%B6%...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sk": {"config_name": "sk", "sample_row": "{\"text\": \"\\\"V\\\\u00fdsledok vyh\\\\u013ead\\\\u00e1vania pre \\\\u201e\\\\u...\", \"timestamp\": \"\\\"2020-02-25T17:24:35Z\\\"\", \"url\": \"\\\"https://ladasvetom.dennikn.sk/page/4/?s\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sl": {"config_name": "sl", "sample_row": "{\"text\": \"\\\"Zakon o pomorski in notranji plovbi /ZPNP/\\\\nZakon...\", \"timestamp\": \"\\\"2013-05-25T15:20:01Z\\\"\", \"url\": \"\\\"http://zakonodaja.gov.si/rpsi/r08/predpis_ZAKO121...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sm": {"config_name": "sm", "sample_row": "{\"text\": \"\\\"Samoa Observer | Manatu O Le Fa\\\\u2019atonu - O se...\", \"timestamp\": \"\\\"2020-08-10T16:08:09Z\\\"\", \"url\": \"\\\"https://www.samoaobserver.ws/category/article/244...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sn": {"config_name": "sn", "sample_row": "{\"text\": \"\\\"Heren Nike Air Max 95 Blauw Wit Schoenen Online,n...\", \"timestamp\": \"\\\"2017-12-14T12:58:56Z\\\"\", \"url\": \"\\\"http://www.pensacolamower.com/heren-nike-air-max-...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "so": {"config_name": "so", "sample_row": "{\"text\": \"\\\"Kiiskii labaad ee fayruska coronavirus oo laga he...\", \"timestamp\": \"\\\"2020-03-31T19:09:40Z\\\"\", \"url\": \"\\\"http://puntlandmirror.net/kiiskii-labaad-ee-fayru...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sq": {"config_name": "sq", "sample_row": "{\"text\": \"\\\"Arritja e grupit t\\\\u00eb par\\\\u00eb t\\\\u00eb migran...\", \"timestamp\": \"\\\"2017-08-24T01:22:38Z\\\"\", \"url\": \"\\\"https://www.evropaelire.org/a/27652834.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sr": {"config_name": "sr", "sample_row": "{\"text\": \"\\\"\\\\ufeff \\\\u041d\\\\u0430\\\\u0442\\\\u0438\\\\u043e\\\\u043d\\\\u0430...\", \"timestamp\": \"\\\"2020-04-06T07:41:55Z\\\"\", \"url\": \"\\\"https://sr.time4invest.com/life-benefits-of-hikin...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "st": {"config_name": "st", "sample_row": "{\"text\": \"\\\"LES COMBATTANTS DE PARIS TOUJOURS EN FORCE POUR L...\", \"timestamp\": \"\\\"2018-03-24T04:14:21Z\\\"\", \"url\": \"\\\"http://drigombaki.skyrock.com/3038956371-LES-COMB...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "su": {"config_name": "su", "sample_row": "{\"text\": \"\\\"abditrass aplikator Oktober 10, 2019 New Google S...\", \"timestamp\": \"\\\"2019-10-24T05:40:49Z\\\"\", \"url\": \"\\\"http://www.abditrass.com/2019/10/jasa-geolistrik-...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sv": {"config_name": "sv", "sample_row": "{\"text\": \"\\\"Zara's Custom Tailor (Pattaya, Thailand) - omd\\\\u0...\", \"timestamp\": \"\\\"2018-11-14T05:15:44Z\\\"\", \"url\": \"\\\"https://www.tripadvisor.se/Attraction_Review-g293...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "sw": {"config_name": "sw", "sample_row": "{\"text\": \"\\\"2016 - 75 Miaka Meiringen Air Base - AviaSpotter....\", \"timestamp\": \"\\\"2019-10-16T05:11:38Z\\\"\", \"url\": \"\\\"https://www.aviaspotter.it/75-jahre-militarflugpl...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ta": {"config_name": "ta", "sample_row": "{\"text\": \"\\\"\\\\u0b95\\\\u0bc1\\\\u0bb4\\\\u0ba8\\\\u0bcd\\\\u0ba4\\\\u0bc8 \\\\u0baa...\", \"timestamp\": \"\\\"2020-07-06T20:32:10Z\\\"\", \"url\": \"\\\"http://www.thinakaran.lk/2019/11/16/%E0%AE%95%E0%...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "te": {"config_name": "te", "sample_row": "{\"text\": \"\\\"\\\\u0c2e\\\\u0c3f\\\\u0c15\\\\u0c4d\\\\u0c30\\\\u0c4b\\\\u0c2e\\\\u0c3e\\\\...\", \"timestamp\": \"\\\"2018-11-17T19:38:19Z\\\"\", \"url\": \"\\\"https://www.pricedekho.com/te/tablets/micromax-fu...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "tg": {"config_name": "tg", "sample_row": "{\"text\": \"\\\"\\\\u0412\\\\u0430\\\\u0437\\\\u0438\\\\u0440\\\\u0438 \\\\u043a\\\\u043e...\", \"timestamp\": \"\\\"2019-11-13T12:05:58Z\\\"\", \"url\": \"\\\"https://www.ozodi.org/a/609049.html\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "th": {"config_name": "th", "sample_row": "{\"text\": \"\\\"\\\\u0e1d\\\\u0e32\\\\u0e01\\\\u0e40\\\\u0e07\\\\u0e34\\\\u0e19 \\\\u0e01...\", \"timestamp\": \"\\\"2019-06-26T03:50:41Z\\\"\", \"url\": \"\\\"http://luatthanhnien.com/%E0%B8%81%E0%B8%B2%E0%B8...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "tr": {"config_name": "tr", "sample_row": "{\"text\": \"\\\"Herhangi bir konuda \\\\u015feyhini aldatmamal\\\\u0131...\", \"timestamp\": \"\\\"2018-10-19T12:18:48Z\\\"\", \"url\": \"\\\"http://kalb-iselim.net/component/content/article/...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "uk": {"config_name": "uk", "sample_row": "{\"text\": \"\\\"\\\\u042f\\\\u043a \\\\u043e\\\\u0431\\\\u043c\\\\u0435\\\\u0436\\\\u0438...\", \"timestamp\": \"\\\"2017-09-22T06:25:33Z\\\"\", \"url\": \"\\\"http://vidpoviday.com/yak-obmezhiti-shvidkist-int...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "und": {"config_name": "und", "sample_row": "{\"text\": \"\\\"Semi-Detached House for Sale - [40x70] 3200sqft 2...\", \"timestamp\": \"\\\"2019-12-09T20:49:49Z\\\"\", \"url\": \"\\\"http://www.hweeprop.com/25309733\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "ur": {"config_name": "ur", "sample_row": "{\"text\": \"\\\"\\\\u0645\\\\u0641\\\\u062a\\\\u06cc \\\\u0645\\\\u062d\\\\u0645\\\\u062f...\", \"timestamp\": \"\\\"2017-09-19T11:38:03Z\\\"\", \"url\": \"\\\"http://www.geourdu.com/mufti-mohammad-naeem-pakis...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "uz": {"config_name": "uz", "sample_row": "{\"text\": \"\\\"Q056 Al-Waqiah - Qaari Usman Birnin Kebbi | dawah...\", \"timestamp\": \"\\\"2020-02-21T16:40:29Z\\\"\", \"url\": \"\\\"https://dawahnigeria.com/dawahcast/l/140302\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "vi": {"config_name": "vi", "sample_row": "{\"text\": \"\\\"Th\\\\u1ee9 hai, 24/12/2018, 07:53 (GMT+7)\\\\nPh\\\\u1ea1...\", \"timestamp\": \"\\\"2020-07-15T00:59:05Z\\\"\", \"url\": \"\\\"https://ndh.vn/vi-mo/kinh-te-xa-hoi-2018-nhung-du...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "xh": {"config_name": "xh", "sample_row": "{\"text\": \"\\\"Iindlela Zakuqala Awayeshumayela Ngazo AmaNgqina ...\", \"timestamp\": \"\\\"2017-11-22T13:53:24Z\\\"\", \"url\": \"\\\"https://www.jw.org/xh/iimpapasho/iincwadi/Incwadi...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "yi": {"config_name": "yi", "sample_row": "{\"text\": \"\\\"\\\\u05d2\\\\u05d5\\\\u05d8\\\\u05e7\\\\u05e1 VOGUE \\\\u05ea\\\\u05d7...\", \"timestamp\": \"\\\"2020-07-06T16:45:27Z\\\"\", \"url\": \"\\\"https://gottex.co.il/collections/gottex/products/...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "yo": {"config_name": "yo", "sample_row": "{\"text\": \"\\\"Linda Merrin | The Jewish Week\\\\nSearch this site:...\", \"timestamp\": \"\\\"2016-07-25T10:48:02Z\\\"\", \"url\": \"\\\"http://www.thejewishweek.com/category/person/lind...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "zh": {"config_name": "zh", "sample_row": "{\"text\": \"\\\"\\\\u9518\\\\ufffd \\\\u6d5c\\\\ufffd\\\\ufffd\\\\ufffd88\\\\u6d93\\\\uff...\", \"timestamp\": \"\\\"2020-01-29T07:21:51Z\\\"\", \"url\": \"\\\"https://311016.cn/safe/2019/1025/14480.htm\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "zh-Latn": {"config_name": "zh-Latn", "sample_row": "{\"text\": \"\\\"Search results for author \\\\\\\"Pan, Y.\\\\\\\"\\\\nSectoral a...\", \"timestamp\": \"\\\"2013-05-21T22:30:14Z\\\"\", \"url\": \"\\\"http://www.cifor.org/online-library/search/public...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}, "zu": {"config_name": "zu", "sample_row": "{\"text\": \"\\\"Battle Hardened \\\\u00b7 TheJournal.ie\\\\n#battle har...\", \"timestamp\": \"\\\"2019-08-18T18:56:56Z\\\"\", \"url\": \"\\\"https://www.thejournal.ie/battle-hardened/news/\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI.\n", "dataset_name": "mc4"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original", "language:af", "language:am", "language:ar", "language:az", "language:be", "language:bg", "language:bn", "language:ca", "language:ceb", "language:co", "language:cs", "language:cy", "language:da", "language:de", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:fi", "language:fil", "language:fr", "language:fy", "language:ga", "language:gd", "language:gl", "language:gu", "language:ha", "language:haw", "language:he", "language:hi", "language:hmn", "language:ht", "language:hu", "language:hy", "language:id", "language:ig", "language:is", "language:it", "language:iw", "language:ja", "language:jv", "language:ka", "language:kk", "language:km", "language:kn", "language:ko", "language:ku", "language:ky", "language:la", "language:lb", "language:lo", "language:lt", "language:lv", "language:mg", "language:mi", "language:mk", "language:ml", "language:mn", "language:mr", "language:ms", "language:mt", "language:my", "language:ne", "language:nl", "language:no", "language:ny", "language:pa", "language:pl", "language:ps", "language:pt", "language:ro", "language:ru", "language:sd", "language:si", "language:sk", "language:sl", "language:sm", "language:sn", "language:so", "language:sq", "language:sr", "language:st", "language:su", "language:sv", "language:sw", "language:ta", "language:te", "language:tg", "language:th", "language:tr", "language:uk", "language:und", "language:ur", "language:uz", "language:vi", "language:xh", "language:yi", "language:yo", "language:zh", "language:zu"], "is_gated": false}, "med_hop": {"dataset_name": "med_hop", "description": "MedHop is based on research paper abstracts from PubMed, and the queries are about interactions between pairs of drugs. The correct answer has to be inferred by combining information from a chain of reactions of drugs and proteins.", "downloads": 507, "configs": {"original": {"config_name": "original", "sample_row": "{\"id\": \"\\\"MH_train_0\\\"\", \"question\": \"\\\"interacts_with DB00773?\\\"\", \"answer\": \"\\\"DB00072\\\"\", \"candidates\": \"[\\\"DB00072\\\", \\\"DB00294\\\", \\\"DB00338\\\", \\\"DB00341\\\", \\\"DB00...\", \"supports\": \"[\\\"Induction of apoptosis of Beta cells of the panc...\"}", "columns": ["id", "question", "answer", "candidates", "supports"], "columns_mapping": {"id": "id", "question": "question", "answer": "answer", "candidates": "candidates", "supports": "supports"}, "dataset_description": "MedHop is based on research paper abstracts from PubMed, and the queries are about interactions between pairs of drugs. The correct answer has to be inferred by combining information from a chain of reactions of drugs and proteins.\n", "dataset_name": "med_hop"}, "masked": {"config_name": "masked", "sample_row": "{\"id\": \"\\\"MH_train_0\\\"\", \"question\": \"\\\"interacts_with DB00773?\\\"\", \"answer\": \"\\\"___MASK51___\\\"\", \"candidates\": \"[\\\"___MASK10___\\\", \\\"___MASK16___\\\", \\\"___MASK2___\\\", \\\"_...\", \"supports\": \"[\\\"Induction of apoptosis of Beta cells of the panc...\"}", "columns": ["id", "question", "answer", "candidates", "supports"], "columns_mapping": {"id": "id", "question": "question", "answer": "answer", "candidates": "candidates", "supports": "supports"}, "dataset_description": "MedHop is based on research paper abstracts from PubMed, and the queries are about interactions between pairs of drugs. The correct answer has to be inferred by combining information from a chain of reactions of drugs and proteins.\n", "dataset_name": "med_hop"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "multi-hop"], "is_gated": false}, "medal": {"dataset_name": "medal", "description": "A large medical text dataset (14Go) curated to 4Go for abbreviation disambiguation, designed for natural language understanding pre-training in the medical domain. For example, DHF can be disambiguated to dihydrofolate, diastolic heart failure, dengue hemorragic fever or dihydroxyfumarate", "downloads": 382, "configs": {"default": {"config_name": "default", "sample_row": "{\"abstract_id\": \"14145090\", \"text\": \"\\\"velvet antlers vas are commonly used in tradition...\", \"location\": \"[63]\", \"label\": \"[\\\"transverse aortic constriction\\\"]\"}", "columns": ["abstract_id", "text", "location", "label"], "columns_mapping": {"abstract_id": "abstract_id", "text": "text", "location": "location", "label": "label"}, "dataset_description": "A large medical text dataset (14Go) curated to 4Go for abbreviation disambiguation, designed for natural language understanding pre-training in the medical domain. For example, DHF can be disambiguated to dihydrofolate, diastolic heart failure, dengue hemorragic fever or dihydroxyfumarate\n", "dataset_name": "medal"}}, "tags": ["task_categories:other", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "disambiguation"], "is_gated": false}, "medical_questions_pairs": {"dataset_name": "medical_questions_pairs", "description": "This dataset consists of 3048 similar and dissimilar medical question pairs hand-generated and labeled by Curai's doctors.", "downloads": 5754, "configs": {"default": {"config_name": "default", "sample_row": "{\"dr_id\": \"1\", \"question_1\": \"\\\"After how many hour from drinking an antibiotic c...\", \"question_2\": \"\\\"I have a party tonight and I took my last dose of...\", \"label\": \"1\"}", "columns": ["dr_id", "question_1", "question_2", "label"], "columns_mapping": {"dr_id": "dr_id", "question_1": "question_1", "question_2": "question_2", "label": "label"}, "dataset_description": "This dataset consists of 3048 similar and dissimilar medical question pairs hand-generated and labeled by Curai's doctors.\n", "dataset_name": "medical_questions_pairs"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "menyo20k_mt": {"dataset_name": "menyo20k_mt", "description": "MENYO-20k is a multi-domain parallel dataset with texts obtained from news articles, ted talks, movie transcripts, radio transcripts, science and technology texts, and other short articles curated from the web and professional translators. The dataset has 20,100 parallel sentences split into 10,070 training sentences, 3,397 development sentences, and 6,633 test sentences (3,419 multi-domain, 1,714 news domain, and 1,500 ted talks speech transcript domain). The development and test sets are available upon request.", "downloads": 291, "configs": {"menyo20k_mt": {"config_name": "menyo20k_mt", "sample_row": "{\"translation.en\": \"\\\"Unit 1: What is Creative Commons?\\\"\", \"translation.yo\": \"\\\"\\\\ufeff\\\\u00ccd\\\\u00e1 1: K\\\\u00edn ni Creative Commo...\"}", "columns": ["translation_en", "translation_yo"], "columns_mapping": {"translation.en": "translation_en", "translation.yo": "translation_yo"}, "dataset_description": "MENYO-20k is a multi-domain parallel dataset with texts obtained from news articles, ted talks, movie transcripts, radio transcripts, science and technology texts, and other short articles curated from the web and professional translators. The dataset has 20,100 parallel sentences split into 10,070 training sentences, 3,397 development sentences, and 6,633 test sentences (3,419 multi-domain, 1,714 news domain, and 1,500 ted talks speech transcript domain). The development and test sets are available upon request.\n", "dataset_name": "menyo20k_mt"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:en", "language:yo"], "is_gated": false}, "meta_woz": {"dataset_name": "meta_woz", "description": "MetaLWOz: A Dataset of Multi-Domain Dialogues for the Fast Adaptation of Conversation Models. We introduce the Meta-Learning Wizard of Oz (MetaLWOz) dialogue dataset for developing fast adaptation methods for conversation models. This data can be used to train task-oriented dialogue models, specifically to develop methods to quickly simulate user responses with a small amount of data. Such fast-adaptation models fall into the research areas of transfer learning and meta learning. The dataset consists of 37,884 crowdsourced dialogues recorded between two human users in a Wizard of Oz setup, in which one was instructed to behave like a bot, and the other a true human user. The users are assigned a task belonging to a particular domain, for example booking a reservation at a particular restaurant, and work together to complete the task. Our dataset spans 47 domains having 227 tasks total. Dialogues are a minimum of 10 turns long.", "downloads": 549, "configs": {"dialogues": {"config_name": "dialogues", "sample_row": "{\"id\": \"\\\"c399a493\\\"\", \"user_id\": \"\\\"c05f0462\\\"\", \"bot_id\": \"\\\"c96edf42\\\"\", \"domain\": \"\\\"AGREEMENT_BOT\\\"\", \"task_id\": \"\\\"a9203a2c\\\"\", \"turns\": \"[\\\"Hello how may I help you?\\\", \\\"i am awesome\\\", \\\"of ...\"}", "columns": ["id", "user_id", "bot_id", "domain", "task_id", "turns"], "columns_mapping": {"id": "id", "user_id": "user_id", "bot_id": "bot_id", "domain": "domain", "task_id": "task_id", "turns": "turns"}, "dataset_description": "MetaLWOz: A Dataset of Multi-Domain Dialogues for the Fast Adaptation of Conversation Models. We introduce the Meta-Learning Wizard of Oz (MetaLWOz) dialogue dataset for developing fast adaptation methods for conversation models. This data can be used to train task-oriented dialogue models, specifically to develop methods to quickly simulate user responses with a small amount of data. Such fast-adaptation models fall into the research areas of transfer learning and meta learning. The dataset consists of 37,884 crowdsourced dialogues recorded between two human users in a Wizard of Oz setup, in which one was instructed to behave like a bot, and the other a true human user. The users are assigned a task belonging to a particular domain, for example booking a reservation at a particular restaurant, and work together to complete the task. Our dataset spans 47 domains having 227 tasks total. Dialogues are a minimum of 10 turns long.\n", "dataset_name": "meta_woz"}, "tasks": {"config_name": "tasks", "sample_row": "{\"task_id\": \"\\\"4a06139e\\\"\", \"domain\": \"\\\"UPDATE_CALENDAR\\\"\", \"bot_prompt\": \"\\\"Schedule the user's meeting request\\\"\", \"bot_role\": \"\\\"You are a bot designed to help schedule meetings ...\", \"user_prompt\": \"\\\" You have a meeting saved for March 24th. Ask the...\", \"user_role\": \"\\\"You are interacting with a meeting scheduling bot...\"}", "columns": ["task_id", "domain", "bot_prompt", "bot_role", "user_prompt", "user_role"], "columns_mapping": {"task_id": "task_id", "domain": "domain", "bot_prompt": "bot_prompt", "bot_role": "bot_role", "user_prompt": "user_prompt", "user_role": "user_role"}, "dataset_description": "MetaLWOz: A Dataset of Multi-Domain Dialogues for the Fast Adaptation of Conversation Models. We introduce the Meta-Learning Wizard of Oz (MetaLWOz) dialogue dataset for developing fast adaptation methods for conversation models. This data can be used to train task-oriented dialogue models, specifically to develop methods to quickly simulate user responses with a small amount of data. Such fast-adaptation models fall into the research areas of transfer learning and meta learning. The dataset consists of 37,884 crowdsourced dialogues recorded between two human users in a Wizard of Oz setup, in which one was instructed to behave like a bot, and the other a true human user. The users are assigned a task belonging to a particular domain, for example booking a reservation at a particular restaurant, and work together to complete the task. Our dataset spans 47 domains having 227 tasks total. Dialogues are a minimum of 10 turns long.\n", "dataset_name": "meta_woz"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "metooma": {"dataset_name": "metooma", "description": "The dataset consists of tweets belonging to #MeToo movement on Twitter, labelled into different categories.\nDue to Twitter's development policies, we only provide the tweet ID's and corresponding labels,\nother data can be fetched via Twitter API.\nThe data has been labelled by experts, with the majority taken into the account for deciding the final label.\nWe provide these labels for each of the tweets. The labels provided for each data point\nincludes -- Relevance, Directed Hate, Generalized Hate,\nSarcasm, Allegation, Justification, Refutation, Support, Oppose", "downloads": 329, "configs": {"default": {"config_name": "default", "sample_row": "{\"TweetId\": \"\\\"1052237153789390853\\\"\", \"Text_Only_Informative\": \"1\", \"Image_Only_Informative\": \"1\", \"Directed_Hate\": \"0\", \"Generalized_Hate\": \"0\", \"Sarcasm\": \"0\", \"Allegation\": \"0\", \"Justification\": \"1\", \"Refutation\": \"0\", \"Support\": \"1\", \"Oppose\": \"0\"}", "columns": ["TweetId", "Text_Only_Informative", "Image_Only_Informative", "Directed_Hate", "Generalized_Hate", "Sarcasm", "Allegation", "Justification", "Refutation", "Support", "Oppose"], "columns_mapping": {"TweetId": "TweetId", "Text_Only_Informative": "Text_Only_Informative", "Image_Only_Informative": "Image_Only_Informative", "Directed_Hate": "Directed_Hate", "Generalized_Hate": "Generalized_Hate", "Sarcasm": "Sarcasm", "Allegation": "Allegation", "Justification": "Justification", "Refutation": "Refutation", "Support": "Support", "Oppose": "Oppose"}, "dataset_description": "The dataset consists of tweets belonging to #MeToo movement on Twitter, labelled into different categories.\nDue to Twitter's development policies, we only provide the tweet ID's and corresponding labels,\nother data can be fetched via Twitter API.\nThe data has been labelled by experts, with the majority taken into the account for deciding the final label.\nWe provide these labels for each of the tweets. The labels provided for each data point\nincludes -- Relevance, Directed Hate, Generalized Hate,\nSarcasm, Allegation, Justification, Refutation, Support, Oppose\n", "dataset_name": "metooma"}}, "tags": ["task_categories:text-classification", "task_categories:text-retrieval", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "miam": {"dataset_name": "miam", "description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.", "downloads": 1070, "configs": {"dihana": {"config_name": "dihana", "sample_row": "{\"Speaker\": \"\\\"M\\\"\", \"Utterance\": \"\\\"Bienvenido al servicio de informaci\\\\u00f3n de tre...\", \"Dialogue_Act\": \"\\\"Apertura\\\"\", \"Dialogue_ID\": \"\\\"1\\\"\", \"File_ID\": \"\\\"B209_BB2a0\\\"\", \"Label\": \"1\", \"Idx\": \"0\"}", "columns": ["Speaker", "Utterance", "Dialogue_Act", "Dialogue_ID", "File_ID", "Label", "Idx"], "columns_mapping": {"Speaker": "Speaker", "Utterance": "Utterance", "Dialogue_Act": "Dialogue_Act", "Dialogue_ID": "Dialogue_ID", "File_ID": "File_ID", "Label": "Label", "Idx": "Idx"}, "dataset_description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.\n", "dataset_name": "miam"}, "ilisten": {"config_name": "ilisten", "sample_row": "{\"Speaker\": \"\\\"S_29_S1\\\"\", \"Utterance\": \"\\\"Ciao, il mio nome e' Valentina. Sono qui per dart...\", \"Dialogue_Act\": \"\\\"OPENING\\\"\", \"Dialogue_ID\": \"\\\"0\\\"\", \"Label\": \"8\", \"Idx\": \"0\"}", "columns": ["Speaker", "Utterance", "Dialogue_Act", "Dialogue_ID", "Label", "Idx"], "columns_mapping": {"Speaker": "Speaker", "Utterance": "Utterance", "Dialogue_Act": "Dialogue_Act", "Dialogue_ID": "Dialogue_ID", "Label": "Label", "Idx": "Idx"}, "dataset_description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.\n", "dataset_name": "miam"}, "loria": {"config_name": "loria", "sample_row": "{\"Speaker\": \"\\\"Lucas\\\"\", \"Utterance\": \"\\\"Alors!\\\"\", \"Dialogue_Act\": \"\\\"greet\\\"\", \"Dialogue_ID\": \"\\\"0\\\"\", \"File_ID\": \"\\\"Dial_20110615_105040\\\"\", \"Label\": \"5\", \"Idx\": \"0\"}", "columns": ["Speaker", "Utterance", "Dialogue_Act", "Dialogue_ID", "File_ID", "Label", "Idx"], "columns_mapping": {"Speaker": "Speaker", "Utterance": "Utterance", "Dialogue_Act": "Dialogue_Act", "Dialogue_ID": "Dialogue_ID", "File_ID": "File_ID", "Label": "Label", "Idx": "Idx"}, "dataset_description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.\n", "dataset_name": "miam"}, "maptask": {"config_name": "maptask", "sample_row": "{\"Speaker\": \"\\\"g\\\"\", \"Utterance\": \"\\\"okay the start part is at the top left-hand corne...\", \"Dialogue_Act\": \"\\\"instruct\\\"\", \"Dialogue_ID\": \"\\\"0\\\"\", \"File_ID\": \"\\\"q7nc7\\\"\", \"Label\": \"5\", \"Idx\": \"0\"}", "columns": ["Speaker", "Utterance", "Dialogue_Act", "Dialogue_ID", "File_ID", "Label", "Idx"], "columns_mapping": {"Speaker": "Speaker", "Utterance": "Utterance", "Dialogue_Act": "Dialogue_Act", "Dialogue_ID": "Dialogue_ID", "File_ID": "File_ID", "Label": "Label", "Idx": "Idx"}, "dataset_description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.\n", "dataset_name": "miam"}, "vm2": {"config_name": "vm2", "sample_row": "{\"Utterance\": \"\\\"mein Name ist Keller $K $E Doppel-$L $E $R\\\"\", \"Dialogue_Act\": \"\\\"INTRODUCE\\\"\", \"Speaker\": \"\\\"A\\\"\", \"Dialogue_ID\": \"\\\"1\\\"\", \"Label\": \"19\", \"Idx\": \"0\"}", "columns": ["Utterance", "Dialogue_Act", "Speaker", "Dialogue_ID", "Label", "Idx"], "columns_mapping": {"Utterance": "Utterance", "Dialogue_Act": "Dialogue_Act", "Speaker": "Speaker", "Dialogue_ID": "Dialogue_ID", "Label": "Label", "Idx": "Idx"}, "dataset_description": "Multilingual dIalogAct benchMark is a collection of resources for training, evaluating, and\nanalyzing natural language understanding systems specifically designed for spoken language. Datasets\nare in English, French, German, Italian and Spanish. They cover a variety of domains including\nspontaneous speech, scripted scenarios, and joint task completion. Some datasets additionally include\nemotion and/or sentimant labels.\n", "dataset_name": "miam"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_categories:text-classification", "task_ids:dialogue-modeling", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:de", "language:en", "language:es", "language:fr", "language:it", "dialogue-act-classification"], "is_gated": false}, "mlsum": {"dataset_name": "mlsum", "description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.", "downloads": 2326, "configs": {"de": {"config_name": "de", "sample_row": "{\"text\": \"\\\"Transport im Viehwaggon, Fleischgeruch in der Luf...\", \"summary\": \"\\\"Transport im Viehwaggon, Fleischgeruch in der Luf...\", \"topic\": \"\\\"politik\\\"\", \"url\": \"\\\"https://www.sueddeutsche.de/politik/kz-auschwitz-...\", \"title\": \"\\\"So war Auschwitz: Erinnerungen einer Holocaust-\\\\u...\", \"date\": \"\\\"00/01/2010\\\"\"}", "columns": ["text", "summary", "topic", "url", "title", "date"], "columns_mapping": {"text": "text", "summary": "summary", "topic": "topic", "url": "url", "title": "title", "date": "date"}, "dataset_description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.\n", "dataset_name": "mlsum"}, "es": {"config_name": "es", "sample_row": "{\"text\": \"\\\"De momento, no podemos responder a la pregunta fr...\", \"summary\": \"\\\"Sofres no ofrece datos por ser festivo.- Telecinc...\", \"topic\": \"\\\"elpais actualidad\\\"\", \"url\": \"\\\"http://elpais.com/elpais/2010/01/01/actualidad/12...\", \"title\": \"\\\"\\\\u00bfQui\\\\u00e9n gan\\\\u00f3 en las campanadas?\\\"\", \"date\": \"\\\"01/01/2010\\\"\"}", "columns": ["text", "summary", "topic", "url", "title", "date"], "columns_mapping": {"text": "text", "summary": "summary", "topic": "topic", "url": "url", "title": "title", "date": "date"}, "dataset_description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.\n", "dataset_name": "mlsum"}, "fr": {"config_name": "fr", "sample_row": "{\"text\": \"\\\"Jean-Jacques Schuhl, Gilles Leroy, Christian Gail...\", \"summary\": \"\\\"Jean-Jacques Schuhl, Gilles Leroy, Christian Gail...\", \"topic\": \"\\\"livres\\\"\", \"url\": \"\\\"https://www.lemonde.fr/livres/article/2010/01/01/...\", \"title\": \"\\\"La rentr\\\\u00e9e litt\\\\u00e9raire promet un program...\", \"date\": \"\\\"01/01/2010\\\"\"}", "columns": ["text", "summary", "topic", "url", "title", "date"], "columns_mapping": {"text": "text", "summary": "summary", "topic": "topic", "url": "url", "title": "title", "date": "date"}, "dataset_description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.\n", "dataset_name": "mlsum"}, "ru": {"config_name": "ru", "sample_row": "{\"text\": \"\\\"\\\\u0421\\\\u043b\\\\u0430\\\\u0434\\\\u043e\\\\u0441\\\\u0442\\\\u0440\\\\...\", \"summary\": \"\\\"\\\\u0421\\\\u0442\\\\u0430\\\\u0440\\\\u0448\\\\u0438\\\\u0439 \\\\u043f...\", \"topic\": \"\\\"incident\\\"\", \"url\": \"\\\"https://www.mk.ru/incident/article/2010/01/05/409...\", \"title\": \"\\\"\\\\u041f\\\\u0435\\\\u0434\\\\u043e\\\\u0444\\\\u0438\\\\u043b \\\\u043f...\", \"date\": \"\\\"06/01/2010\\\"\"}", "columns": ["text", "summary", "topic", "url", "title", "date"], "columns_mapping": {"text": "text", "summary": "summary", "topic": "topic", "url": "url", "title": "title", "date": "date"}, "dataset_description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.\n", "dataset_name": "mlsum"}, "tu": {"config_name": "tu", "sample_row": "{\"text\": \"\\\"Ara\\\\u00e7 sahipleri i\\\\u00e7in pahal\\\\u0131 benzine...\", \"summary\": \"\\\"Benzinin litresi 4 liraya yakla\\\\u015ft\\\\u0131. Asl...\", \"topic\": \"\\\"unknown\\\"\", \"url\": \"\\\"https://www.internethaber.com/aracinizda-yuzde-30...\", \"title\": \"\\\"Arac\\\\u0131n\\\\u0131zda y\\\\u00fczde 30 tarassuf edin\\\"...\", \"date\": \"\\\"00/01/2010\\\"\"}", "columns": ["text", "summary", "topic", "url", "title", "date"], "columns_mapping": {"text": "text", "summary": "summary", "topic": "topic", "url": "url", "title": "title", "date": "date"}, "dataset_description": "We present MLSUM, the first large-scale MultiLingual SUMmarization dataset.\nObtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.\nTogether with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.\nWe report cross-lingual comparative analyses based on state-of-the-art systems.\nThese highlight existing biases which motivate the use of a multi-lingual dataset.\n", "dataset_name": "mlsum"}}, "tags": ["task_categories:summarization", "task_categories:translation", "task_categories:text-classification", "task_ids:news-articles-summarization", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "task_ids:topic-classification", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:extended|cnn_dailymail", "source_datasets:original", "language:de", "language:es", "language:fr", "language:ru", "language:tr"], "is_gated": false}, "moroco": {"dataset_name": "moroco", "description": "The MOROCO (Moldavian and Romanian Dialectal Corpus) dataset contains 33564 samples of text collected from the news domain.\nThe samples belong to one of the following six topics:\n - culture\n - finance\n - politics\n - science\n - sports\n - tech", "downloads": 298, "configs": {"moroco": {"config_name": "moroco", "sample_row": "{\"id\": \"\\\"48482\\\"\", \"category\": \"2\", \"sample\": \"\\\"\\\\u201c$NE$ cum am spus, nu este un sf\\\\u00e2r\\\\u015...\"}", "columns": ["id", "category", "sample"], "columns_mapping": {"id": "id", "category": "category", "sample": "sample"}, "dataset_description": "The MOROCO (Moldavian and Romanian Dialectal Corpus) dataset contains 33564 samples of text collected from the news domain.\nThe samples belong to one of the following six topics:\n - culture\n - finance\n - politics\n - science\n - sports\n - tech\n", "dataset_name": "moroco"}}, "tags": ["task_categories:text-classification", "task_ids:topic-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:ro"], "is_gated": false}, "movie_rationales": {"dataset_name": "movie_rationales", "description": "The movie rationale dataset contains human annotated rationales for movie\nreviews.", "downloads": 809, "configs": {"default": {"config_name": "default", "sample_row": "{\"review\": \"\\\"plot : two teen couples go to a church party , dr...\", \"label\": \"0\", \"evidences\": \"[\\\"mind - fuck movie\\\", \\\"the sad part is\\\", \\\"downshif...\"}", "columns": ["review", "label", "evidences"], "columns_mapping": {"review": "review", "label": "label", "evidences": "evidences"}, "dataset_description": "\nThe movie rationale dataset contains human annotated rationales for movie\nreviews.\n", "dataset_name": "movie_rationales"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "mrqa": {"dataset_name": "mrqa", "description": "The MRQA 2019 Shared Task focuses on generalization in question answering.\nAn effective question answering system should do more than merely\ninterpolate from the training set to answer test examples drawn\nfrom the same distribution: it should also be able to extrapolate\nto out-of-distribution examples \u2014 a significantly harder challenge.\n\nThe dataset is a collection of 18 existing QA dataset (carefully selected\nsubset of them) and converted to the same format (SQuAD format). Among\nthese 18 datasets, six datasets were made available for training,\nsix datasets were made available for development, and the final six\nfor testing. The dataset is released as part of the MRQA 2019 Shared Task.", "downloads": 1286, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"subset\": \"\\\"SQuAD\\\"\", \"context\": \"\\\"Architecturally, the school has a Catholic charac...\", \"context_tokens.tokens\": \"[\\\"Architecturally\\\", \\\",\\\", \\\"the\\\", \\\"school\\\", \\\"has\\\", \\\"...\", \"context_tokens.offsets\": \"[0, 15, 17, 21, 28, 32, 34, 43, 52, 54, 59, 63, 68...\", \"qid\": \"\\\"38cc2597b6624bd8af1e8ba7f693096f\\\"\", \"question\": \"\\\"To whom did the Virgin Mary allegedly appear in 1...\", \"question_tokens.tokens\": \"[\\\"To\\\", \\\"whom\\\", \\\"did\\\", \\\"the\\\", \\\"Virgin\\\", \\\"Mary\\\", \\\"al...\", \"question_tokens.offsets\": \"[0, 3, 8, 12, 16, 23, 28, 38, 45, 48, 53, 56, 64, ...\", \"detected_answers.text\": \"[\\\"Saint Bernadette Soubirous\\\"]\", \"detected_answers.char_spans\": \"[{\\\"start\\\": [515], \\\"end\\\": [540]}]\", \"detected_answers.token_spans\": \"[{\\\"start\\\": [102], \\\"end\\\": [104]}]\", \"answers\": \"[\\\"Saint Bernadette Soubirous\\\"]\"}", "columns": ["subset", "context", "context_tokens_tokens", "context_tokens_offsets", "qid", "question", "question_tokens_tokens", "question_tokens_offsets", "detected_answers_text", "detected_answers_char_spans", "detected_answers_token_spans", "answers"], "columns_mapping": {"subset": "subset", "context": "context", "context_tokens.tokens": "context_tokens_tokens", "context_tokens.offsets": "context_tokens_offsets", "qid": "qid", "question": "question", "question_tokens.tokens": "question_tokens_tokens", "question_tokens.offsets": "question_tokens_offsets", "detected_answers.text": "detected_answers_text", "detected_answers.char_spans": "detected_answers_char_spans", "detected_answers.token_spans": "detected_answers_token_spans", "answers": "answers"}, "dataset_description": "The MRQA 2019 Shared Task focuses on generalization in question answering.\nAn effective question answering system should do more than merely\ninterpolate from the training set to answer test examples drawn\nfrom the same distribution: it should also be able to extrapolate\nto out-of-distribution examples \u2014 a significantly harder challenge.\n\nThe dataset is a collection of 18 existing QA dataset (carefully selected\nsubset of them) and converted to the same format (SQuAD format). Among\nthese 18 datasets, six datasets were made available for training,\nsix datasets were made available for development, and the final six\nfor testing. The dataset is released as part of the MRQA 2019 Shared Task.\n", "dataset_name": "mrqa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:extended|drop", "source_datasets:extended|hotpot_qa", "source_datasets:extended|natural_questions", "source_datasets:extended|race", "source_datasets:extended|search_qa", "source_datasets:extended|squad", "source_datasets:extended|trivia_qa", "language:en"], "is_gated": false}, "msr_sqa": {"dataset_name": "msr_sqa", "description": "Recent work in semantic parsing for question answering has focused on long and complicated questions, many of which would seem unnatural if asked in a normal conversation between two humans. In an effort to explore a conversational QA setting, we present a more realistic task: answering sequences of simple but inter-related questions. We created SQA by asking crowdsourced workers to decompose 2,022 questions from WikiTableQuestions (WTQ), which contains highly-compositional questions about tables from Wikipedia. We had three workers decompose each WTQ question, resulting in a dataset of 6,066 sequences that contain 17,553 questions in total. Each question is also associated with answers in the form of cell locations in the tables.", "downloads": 476, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"nt-639\\\"\", \"annotator\": \"0\", \"position\": \"0\", \"question\": \"\\\"where are the players from?\\\"\", \"question_and_history\": \"[\\\"where are the players from?\\\"]\", \"table_file\": \"\\\"table_csv/203_149.csv\\\"\", \"table_header\": \"[\\\"Pick\\\", \\\"Player\\\", \\\"Team\\\", \\\"Position\\\", \\\"School\\\"]...\", \"table_data\": \"[[\\\"1\\\", \\\"Ben McDonald\\\", \\\"Baltimore Orioles\\\", \\\"RHP\\\",...\", \"answer_coordinates.row_index\": \"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...\", \"answer_coordinates.column_index\": \"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...\", \"answer_text\": \"[\\\"Louisiana State University\\\", \\\"Valley HS (Las Veg...\"}", "columns": ["id", "annotator", "position", "question", "question_and_history", "table_file", "table_header", "table_data", "answer_coordinates_row_index", "answer_coordinates_column_index", "answer_text"], "columns_mapping": {"id": "id", "annotator": "annotator", "position": "position", "question": "question", "question_and_history": "question_and_history", "table_file": "table_file", "table_header": "table_header", "table_data": "table_data", "answer_coordinates.row_index": "answer_coordinates_row_index", "answer_coordinates.column_index": "answer_coordinates_column_index", "answer_text": "answer_text"}, "dataset_description": "Recent work in semantic parsing for question answering has focused on long and complicated questions, many of which would seem unnatural if asked in a normal conversation between two humans. In an effort to explore a conversational QA setting, we present a more realistic task: answering sequences of simple but inter-related questions. We created SQA by asking crowdsourced workers to decompose 2,022 questions from WikiTableQuestions (WTQ), which contains highly-compositional questions about tables from Wikipedia. We had three workers decompose each WTQ question, resulting in a dataset of 6,066 sequences that contain 17,553 questions in total. Each question is also associated with answers in the form of cell locations in the tables.\n", "dataset_name": "msr_sqa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "msra_ner": {"dataset_name": "msra_ner", "description": "The Third International Chinese Language\nProcessing Bakeoff was held in Spring\n2006 to assess the state of the art in two\nimportant tasks: word segmentation and\nnamed entity recognition. Twenty-nine\ngroups submitted result sets in the two\ntasks across two tracks and a total of five\ncorpora. We found strong results in both\ntasks as well as continuing challenges.\n\nMSRA NER is one of the provided dataset.\nThere are three types of NE, PER (person),\nORG (organization) and LOC (location).\nThe dataset is in the BIO scheme.\n\nFor more details see https://faculty.washington.edu/levow/papers/sighan06.pdf", "downloads": 636, "configs": {"msra_ner": {"config_name": "msra_ner", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u5f53\\\", \\\"\\\\u5e0c\\\", \\\"\\\\u671b\\\", \\\"\\\\u5de5\\\", \\\"\\\\u7a0b\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The Third International Chinese Language\nProcessing Bakeoff was held in Spring\n2006 to assess the state of the art in two\nimportant tasks: word segmentation and\nnamed entity recognition. Twenty-nine\ngroups submitted result sets in the two\ntasks across two tracks and a total of five\ncorpora. We found strong results in both\ntasks as well as continuing challenges.\n\nMSRA NER is one of the provided dataset.\nThere are three types of NE, PER (person),\nORG (organization) and LOC (location).\nThe dataset is in the BIO scheme.\n\nFor more details see https://faculty.washington.edu/levow/papers/sighan06.pdf\n", "dataset_name": "msra_ner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:zh"], "is_gated": false}, "multi_news": {"dataset_name": "multi_news", "description": "Multi-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"|||||\".\n - summary: news summary.", "downloads": 4950, "configs": {"default": {"config_name": "default", "sample_row": "{\"document\": \"\\\"National Archives \\\\n \\\\n Yes, it\\\\u2019s that time ...\", \"summary\": \"\\\"\\\\u2013 The unemployment rate dropped to 8.2% last...\"}", "columns": ["document", "summary"], "columns_mapping": {"document": "document", "summary": "summary"}, "dataset_description": "\nMulti-News, consists of news articles and human-written summaries\nof these articles from the site newser.com.\nEach summary is professionally written by editors and\nincludes links to the original articles cited.\n\nThere are two features:\n - document: text of news articles seperated by special token \"|||||\".\n - summary: news summary.\n", "dataset_name": "multi_news"}}, "tags": ["task_categories:summarization", "task_ids:news-articles-summarization", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "multi_nli": {"dataset_name": "multi_nli", "description": "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a\ncrowd-sourced collection of 433k sentence pairs annotated with textual\nentailment information. The corpus is modeled on the SNLI corpus, but differs in\nthat covers a range of genres of spoken and written text, and supports a\ndistinctive cross-genre generalization evaluation. The corpus served as the\nbasis for the shared task of the RepEval 2017 Workshop at EMNLP in Copenhagen.", "downloads": 8551, "configs": {"default": {"config_name": "default", "sample_row": "{\"promptID\": \"31193\", \"pairID\": \"\\\"31193n\\\"\", \"premise\": \"\\\"Conceptually cream skimming has two basic dimensi...\", \"premise_binary_parse\": \"\\\"( ( Conceptually ( cream skimming ) ) ( ( has ( (...\", \"premise_parse\": \"\\\"(ROOT (S (NP (JJ Conceptually) (NN cream) (NN ski...\", \"hypothesis\": \"\\\"Product and geography are what make cream skimmin...\", \"hypothesis_binary_parse\": \"\\\"( ( ( Product and ) geography ) ( ( are ( what ( ...\", \"hypothesis_parse\": \"\\\"(ROOT (S (NP (NN Product) (CC and) (NN geography)...\", \"genre\": \"\\\"government\\\"\", \"label\": \"1\"}", "columns": ["promptID", "pairID", "premise", "premise_binary_parse", "premise_parse", "hypothesis", "hypothesis_binary_parse", "hypothesis_parse", "genre", "label"], "columns_mapping": {"promptID": "promptID", "pairID": "pairID", "premise": "premise", "premise_binary_parse": "premise_binary_parse", "premise_parse": "premise_parse", "hypothesis": "hypothesis", "hypothesis_binary_parse": "hypothesis_binary_parse", "hypothesis_parse": "hypothesis_parse", "genre": "genre", "label": "label"}, "dataset_description": "The Multi-Genre Natural Language Inference (MultiNLI) corpus is a\ncrowd-sourced collection of 433k sentence pairs annotated with textual\nentailment information. The corpus is modeled on the SNLI corpus, but differs in\nthat covers a range of genres of spoken and written text, and supports a\ndistinctive cross-genre generalization evaluation. The corpus served as the\nbasis for the shared task of the RepEval 2017 Workshop at EMNLP in Copenhagen.\n", "dataset_name": "multi_nli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "task_ids:multi-input-text-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "multi_para_crawl": {"dataset_name": "multi_para_crawl", "description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.", "downloads": 848, "configs": {"cs-is": {"config_name": "cs-is", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.cs\": \"\\\"barva kv\\\\u011btina vinn\\\\u00fd, \\\\u0161e\\\\u0159\\\\u00e...\", \"translation.is\": \"\\\"bl\\\\u00f3m lit burgundy, lilac, bleikur, gr\\\\u00e6n...\"}", "columns": ["id", "translation_cs", "translation_is"], "columns_mapping": {"id": "id", "translation.cs": "translation_cs", "translation.is": "translation_is"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.\n", "dataset_name": "multi_para_crawl"}, "ga-sk": {"config_name": "ga-sk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ga\": \"\\\"T\\\\u00e1 na deities go l\\\\u00e9ir ceangailte go bea...\", \"translation.sk\": \"\\\"V\\\\u0161etky bo\\\\u017estv\\\\u00e1 s\\\\u00fa s nimi neja...\"}", "columns": ["id", "translation_ga", "translation_sk"], "columns_mapping": {"id": "id", "translation.ga": "translation_ga", "translation.sk": "translation_sk"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.\n", "dataset_name": "multi_para_crawl"}, "lv-mt": {"config_name": "lv-mt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.lv\": \"\\\"Pirmais satur izaicin\\\\u0101jumu Kor\\\\u0101na, kas ...\", \"translation.mt\": \"\\\"L-ewwel jinkludi l-isfida ta 'l-Koran li hija l-K...\"}", "columns": ["id", "translation_lv", "translation_mt"], "columns_mapping": {"id": "id", "translation.lv": "translation_lv", "translation.mt": "translation_mt"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.\n", "dataset_name": "multi_para_crawl"}, "nb-ru": {"config_name": "nb-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.nb\": \"\\\"-gir beskyttelse mot handlingen av solstr\\\\u00e5li...\", \"translation.ru\": \"\\\"-\\\\u0434\\\\u0430\\\\u0435\\\\u0442 \\\\u0437\\\\u0430\\\\u0449\\\\u043...\"}", "columns": ["id", "translation_nb", "translation_ru"], "columns_mapping": {"id": "id", "translation.nb": "translation_nb", "translation.ru": "translation_ru"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.\n", "dataset_name": "multi_para_crawl"}, "de-tl": {"config_name": "de-tl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Weil Polizei nicht Verst\\\\u00e4rkersysteme im Bere...\", \"translation.tl\": \"\\\"Dahil ang pulis hindi papayagan ang paglaki mga s...\"}", "columns": ["id", "translation_de", "translation_tl"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.tl": "translation_tl"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project and further processed for making it a multi-parallel corpus by pivoting via English. Here we only provide the additional language pairs that came out of pivoting. The bitexts for English are available from the ParaCrawl release.\n40 languages, 669 bitexts\ntotal number of files: 40\ntotal number of tokens: 10.14G\ntotal number of sentence fragments: 505.48M\n\nPlease, acknowledge the ParaCrawl project at http://paracrawl.eu. This version is derived from the original release at their website adjusted for redistribution via the OPUS corpus collection. Please, acknowledge OPUS as well for this service.\n", "dataset_name": "multi_para_crawl"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:bg", "language:ca", "language:cs", "language:da", "language:de", "language:el", "language:es", "language:et", "language:eu", "language:fi", "language:fr", "language:ga", "language:gl", "language:ha", "language:hr", "language:hu", "language:ig", "language:is", "language:it", "language:km", "language:lt", "language:lv", "language:mt", "language:my", "language:nb", "language:ne", "language:nl", "language:nn", "language:pl", "language:ps", "language:pt", "language:ro", "language:ru", "language:si", "language:sk", "language:sl", "language:so", "language:sv", "language:sw", "language:tl"], "is_gated": false}, "multi_x_science_sum": {"dataset_name": "multi_x_science_sum", "description": "Multi-XScience, a large-scale multi-document summarization dataset created from scientific articles. Multi-XScience introduces a challenging multi-document summarization task: writing the related-work section of a paper based on its abstract and the articles it references.", "downloads": 793, "configs": {"default": {"config_name": "default", "sample_row": "{\"aid\": \"\\\"math9912167\\\"\", \"mid\": \"\\\"1631980677\\\"\", \"abstract\": \"\\\"Author(s): Kuperberg, Greg; Thurston, Dylan P. | ...\", \"related_work\": \"\\\"Two other generalizations that can be considered ...\", \"ref_abstract.cite_N\": \"[\\\"@cite_16\\\", \\\"@cite_26\\\"]\", \"ref_abstract.mid\": \"[\\\"1481005306\\\", \\\"1641082372\\\"]\", \"ref_abstract.abstract\": \"[\\\"This note is a sequel to our earlier paper of th...\"}", "columns": ["aid", "mid", "abstract", "related_work", "ref_abstract_cite_N", "ref_abstract_mid", "ref_abstract_abstract"], "columns_mapping": {"aid": "aid", "mid": "mid", "abstract": "abstract", "related_work": "related_work", "ref_abstract.cite_N": "ref_abstract_cite_N", "ref_abstract.mid": "ref_abstract_mid", "ref_abstract.abstract": "ref_abstract_abstract"}, "dataset_description": "\nMulti-XScience, a large-scale multi-document summarization dataset created from scientific articles. Multi-XScience introduces a challenging multi-document summarization task: writing the related-work section of a paper based on its abstract and the articles it references.\n", "dataset_name": "multi_x_science_sum"}}, "tags": ["task_categories:summarization", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "paper-abstract-generation"], "is_gated": false}, "multidoc2dial": {"dataset_name": "multidoc2dial", "description": "MultiDoc2Dial is a new task and dataset on modeling goal-oriented dialogues grounded in multiple documents. Most previous works treat document-grounded dialogue modeling as a machine reading comprehension task based on a single given document or passage. We aim to address more realistic scenarios where a goal-oriented information-seeking conversation involves multiple topics, and hence is grounded on different documents.", "downloads": 1007, "configs": {"dialogue_domain": {"config_name": "dialogue_domain", "sample_row": "{\"dial_id\": \"\\\"8df07b7a98990db27c395cb1f68a962e\\\"\", \"domain\": \"\\\"dmv\\\"\", \"turns\": \"[{\\\"da\\\": \\\"query_condition\\\", \\\"references\\\": [{\\\"label\\\"...\"}", "columns": ["dial_id", "domain", "turns"], "columns_mapping": {"dial_id": "dial_id", "domain": "domain", "turns": "turns"}, "dataset_description": "MultiDoc2Dial is a new task and dataset on modeling goal-oriented dialogues grounded in multiple documents. Most previous works treat document-grounded dialogue modeling as a machine reading comprehension task based on a single given document or passage. We aim to address more realistic scenarios where a goal-oriented information-seeking conversation involves multiple topics, and hence is grounded on different documents.\n", "dataset_name": "multidoc2dial"}, "document_domain": {"config_name": "document_domain", "sample_row": "{\"domain\": \"\\\"ssa\\\"\", \"doc_id\": \"\\\"Benefits Planner: Survivors | Planning For Your S...\", \"title\": \"\\\"Benefits Planner: Survivors | Planning For Your S...\", \"doc_text\": \"\\\"\\\\n\\\\nBenefits Planner: Survivors | Planning For Yo...\", \"spans\": \"[{\\\"id_sp\\\": \\\"1\\\", \\\"tag\\\": \\\"h2\\\", \\\"start_sp\\\": 0, \\\"end_s...\", \"doc_html_ts\": \"\\\"

\\\\nSubject: Alt....\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_comp.graphics": {"config_name": "bydate_comp.graphics", "sample_row": "{\"text\": \"\\\"From: lipman@oasys.dt.navy.mil (Robert Lipman)\\\\nS...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_comp.os.ms-windows.misc": {"config_name": "bydate_comp.os.ms-windows.misc", "sample_row": "{\"text\": \"\\\"From: lipman@oasys.dt.navy.mil (Robert Lipman)\\\\nS...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_comp.sys.ibm.pc.hardware": {"config_name": "bydate_comp.sys.ibm.pc.hardware", "sample_row": "{\"text\": \"\\\"From: bobmon@cs.indiana.edu (Bob Montante)\\\\nSubje...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_comp.sys.mac.hardware": {"config_name": "bydate_comp.sys.mac.hardware", "sample_row": "{\"text\": \"\\\"Subject: ** Need Advice ** (about Tech Works etc....\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_comp.windows.x": {"config_name": "bydate_comp.windows.x", "sample_row": "{\"text\": \"\\\"From: chongo@toad.com (Landon C. Noll)\\\\nSubject: ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_misc.forsale": {"config_name": "bydate_misc.forsale", "sample_row": "{\"text\": \"\\\"From: kedz@bigwpi.WPI.EDU (John Kedziora)\\\\nSubjec...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_rec.autos": {"config_name": "bydate_rec.autos", "sample_row": "{\"text\": \"\\\"From: dennisk@cs.uoregon.edu (Dennis Kennedy)\\\\nSu...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_rec.motorcycles": {"config_name": "bydate_rec.motorcycles", "sample_row": "{\"text\": \"\\\"From: ivan@erich.triumf.ca (Ivan D. Reid)\\\\nSubjec...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_rec.sport.baseball": {"config_name": "bydate_rec.sport.baseball", "sample_row": "{\"text\": \"\\\"From: admiral@jhunix.hcf.jhu.edu (Steve C Liu)\\\\nS...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_rec.sport.hockey": {"config_name": "bydate_rec.sport.hockey", "sample_row": "{\"text\": \"\\\"From: ayari@judikael.loria.fr (Ayari Iskander)\\\\nS...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_sci.crypt": {"config_name": "bydate_sci.crypt", "sample_row": "{\"text\": \"\\\"From: Marc VanHeyningen ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_sci.electronics": {"config_name": "bydate_sci.electronics", "sample_row": "{\"text\": \"\\\"From: keith@radio.nl.nuwc.navy.mil\\\\nSubject: Tekt...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_sci.med": {"config_name": "bydate_sci.med", "sample_row": "{\"text\": \"\\\"From: bed@intacc.uucp (Deb Waddington)\\\\nSubject: ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_sci.space": {"config_name": "bydate_sci.space", "sample_row": "{\"text\": \"\\\"From: et@teal.csn.org (Eric H. Taylor)\\\\nSubject: ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_soc.religion.christian": {"config_name": "bydate_soc.religion.christian", "sample_row": "{\"text\": \"\\\"From: jenk@microsoft.com (Jen Kilmer)\\\\nSubject: R...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_talk.politics.guns": {"config_name": "bydate_talk.politics.guns", "sample_row": "{\"text\": \"\\\"From: manes@magpie.linknet.com (Steve Manes)\\\\nSub...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_talk.politics.mideast": {"config_name": "bydate_talk.politics.mideast", "sample_row": "{\"text\": \"\\\"From: sera@zuma.UUCP (Serdar Argic)\\\\nSubject: The...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_talk.politics.misc": {"config_name": "bydate_talk.politics.misc", "sample_row": "{\"text\": \"\\\"From: mpye@vmsb.is.csupomona.edu\\\\nSubject: Re: Me...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}, "bydate_talk.religion.misc": {"config_name": "bydate_talk.religion.misc", "sample_row": "{\"text\": \"\\\"X-Mailer: TMail version 1.17R\\\\nFrom: \\\\\\\"D. C. Sess...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "\nThe 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across\n20 different newsgroups. The 20 newsgroups collection has become a popular data set for experiments in text applications of\nmachine learning techniques, such as text classification and text clustering.\n\nsorted by date into training(60%) and test(40%) sets, does not include cross-posts (duplicates) and does not include newsgroup-identifying headers (Xref, Newsgroups, Path, Followup-To, Date)", "dataset_name": "newsgroup"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "newsph": {"dataset_name": "newsph", "description": "Large-scale dataset of Filipino news articles. Sourced for the NewsPH-NLI Project (Cruz et al., 2020).", "downloads": 290, "configs": {"newsph": {"config_name": "newsph", "sample_row": "{\"text\": \"\\\"= Task force tutugisin ang suspek sa pagpatay ng ...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "Large-scale dataset of Filipino news articles. Sourced for the NewsPH-NLI Project (Cruz et al., 2020).\n", "dataset_name": "newsph"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:fil", "language:tl"], "is_gated": false}, "newsph_nli": {"dataset_name": "newsph_nli", "description": "First benchmark dataset for sentence entailment in the low-resource Filipino language.\nConstructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs,\nin 70-15-15 split for training, validation, and testing.", "downloads": 301, "configs": {"default": {"config_name": "default", "sample_row": "{\"premise\": \"\\\"\\\\\\\"Hindi ko ugali ang mamulitika; mas gusto kong t...\", \"hypothesis\": \"\\\"Ito ang dineklara ni Atty. Romulo Macalintal, abo...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "First benchmark dataset for sentence entailment in the low-resource Filipino language.\nConstructed through exploting the structure of news articles. Contains 600,000 premise-hypothesis pairs,\nin 70-15-15 split for training, validation, and testing.\n", "dataset_name": "newsph_nli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:tl"], "is_gated": false}, "newspop": {"dataset_name": "newspop", "description": "This is a large data set of news items and their respective social feedback on multiple platforms: Facebook, Google+ and LinkedIn.\nThe collected data relates to a period of 8 months, between November 2015 and July 2016, accounting for about 100,000 news items on four different topics: economy, microsoft, obama and palestine.\nThis data set is tailored for evaluative comparisons in predictive analytics tasks, although allowing for tasks in other research areas such as topic detection and tracking, sentiment analysis in short text, first story detection or news recommendation.", "downloads": 373, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"99248\", \"title\": \"\\\"Obama Lays Wreath at Arlington National Cemetery\\\"...\", \"headline\": \"\\\"Obama Lays Wreath at Arlington National Cemetery....\", \"source\": \"\\\"USA TODAY\\\"\", \"topic\": \"\\\"obama\\\"\", \"publish_date\": \"\\\"2002-04-02 00:00:00\\\"\", \"facebook\": \"-1\", \"google_plus\": \"-1\", \"linked_in\": \"-1\"}", "columns": ["id", "title", "headline", "source", "topic", "publish_date", "facebook", "google_plus", "linked_in"], "columns_mapping": {"id": "id", "title": "title", "headline": "headline", "source": "source", "topic": "topic", "publish_date": "publish_date", "facebook": "facebook", "google_plus": "google_plus", "linked_in": "linked_in"}, "dataset_description": "\nThis is a large data set of news items and their respective social feedback on multiple platforms: Facebook, Google+ and LinkedIn.\nThe collected data relates to a period of 8 months, between November 2015 and July 2016, accounting for about 100,000 news items on four different topics: economy, microsoft, obama and palestine.\nThis data set is tailored for evaluative comparisons in predictive analytics tasks, although allowing for tasks in other research areas such as topic detection and tracking, sentiment analysis in short text, first story detection or news recommendation.\n", "dataset_name": "newspop"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "social-media-shares-prediction"], "is_gated": false}, "nkjp-ner": {"dataset_name": "nkjp-ner", "description": "The NKJP-NER is based on a human-annotated part of National Corpus of Polish (NKJP). We extracted sentences with named entities of exactly one type. The task is to predict the type of the named entity.", "downloads": 288, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence\": \"\\\"- Widzi pani , a Blokowa wzi\\\\u0119\\\\u0142a i si\\\\u0...\", \"target\": \"1\"}", "columns": ["sentence", "target"], "columns_mapping": {"sentence": "sentence", "target": "target"}, "dataset_description": "The NKJP-NER is based on a human-annotated part of National Corpus of Polish (NKJP). We extracted sentences with named entities of exactly one type. The task is to predict the type of the named entity.\n", "dataset_name": "nkjp-ner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl"], "is_gated": false}, "nli_tr": {"dataset_name": "nli_tr", "description": "\\\r\nThe Natural Language Inference in Turkish (NLI-TR) is a set of two large scale datasets that were obtained by translating the foundational NLI corpora (SNLI and MNLI) using Amazon Translate.", "downloads": 854, "configs": {"snli_tr": {"config_name": "snli_tr", "sample_row": "{\"idx\": \"0\", \"premise\": \"\\\"Attaki bir ki\\\\u015fi, bozuk bir u\\\\u00e7a\\\\u011f\\\\u0...\", \"hypothesis\": \"\\\"Bir ki\\\\u015fi at\\\\u0131n\\\\u0131 yar\\\\u0131\\\\u015fma i...\", \"label\": \"1\"}", "columns": ["idx", "premise", "hypothesis", "label"], "columns_mapping": {"idx": "idx", "premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The Natural Language Inference in Turkish (NLI-TR) is a set of two large scale datasets that were obtained by translating the foundational NLI corpora (SNLI and MNLI) using Amazon Translate.\n", "dataset_name": "nli_tr"}, "multinli_tr": {"config_name": "multinli_tr", "sample_row": "{\"idx\": \"0\", \"premise\": \"\\\"Kavramsal olarak krem kayma\\\\u011f\\\\u0131n\\\\u0131n i...\", \"hypothesis\": \"\\\"\\\\u00dcr\\\\u00fcn ve co\\\\u011frafya krem kayma\\\\u011f\\\\...\", \"label\": \"1\"}", "columns": ["idx", "premise", "hypothesis", "label"], "columns_mapping": {"idx": "idx", "premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "The Natural Language Inference in Turkish (NLI-TR) is a set of two large scale datasets that were obtained by translating the foundational NLI corpora (SNLI and MNLI) using Amazon Translate.\n", "dataset_name": "nli_tr"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "task_ids:semantic-similarity-scoring", "task_ids:text-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|snli", "source_datasets:extended|multi_nli", "language:tr"], "is_gated": false}, "nlu_evaluation_data": {"dataset_name": "nlu_evaluation_data", "description": "Raw part of NLU Evaluation Data. It contains 25 715 non-empty examples (original dataset has 25716 examples) from 68 unique intents belonging to 18 scenarios.", "downloads": 418, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"wake me up at five am this week\\\"\", \"scenario\": \"\\\"alarm\\\"\", \"label\": \"2\"}", "columns": ["text", "scenario", "label"], "columns_mapping": {"text": "text", "scenario": "scenario", "label": "label"}, "dataset_description": "Raw part of NLU Evaluation Data. It contains 25 715 non-empty examples (original dataset has 25716 examples) from 68 unique intents belonging to 18 scenarios.\n", "dataset_name": "nlu_evaluation_data"}}, "tags": ["task_categories:text-classification", "task_ids:intent-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "nq_open": {"dataset_name": "nq_open", "description": "The NQ-Open task, introduced by Lee et.al. 2019,\nis an open domain question answering benchmark that is derived from Natural Questions.\nThe goal is to predict an English answer string for an input English question.\nAll questions can be answered using the contents of English Wikipedia.", "downloads": 16377, "configs": {"nq_open": {"config_name": "nq_open", "sample_row": "{\"question\": \"\\\"where did they film hot tub time machine\\\"\", \"answer\": \"[\\\"Fernie Alpine Resort\\\"]\"}", "columns": ["question", "answer"], "columns_mapping": {"question": "question", "answer": "answer"}, "dataset_description": "The NQ-Open task, introduced by Lee et.al. 2019,\nis an open domain question answering benchmark that is derived from Natural Questions.\nThe goal is to predict an English answer string for an input English question.\nAll questions can be answered using the contents of English Wikipedia.\n", "dataset_name": "nq_open"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|natural_questions", "language:en"], "is_gated": false}, "nsmc": {"dataset_name": "nsmc", "description": "This is a movie review dataset in the Korean language. Reviews were scraped from Naver movies. The dataset construction is based on the method noted in Large movie review dataset from Maas et al., 2011.", "downloads": 2815, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"9976970\\\"\", \"document\": \"\\\"\\\\uc544 \\\\ub354\\\\ube59.. \\\\uc9c4\\\\uc9dc \\\\uc9dc\\\\uc99d\\\\u...\", \"label\": \"0\"}", "columns": ["id", "document", "label"], "columns_mapping": {"id": "id", "document": "document", "label": "label"}, "dataset_description": "This is a movie review dataset in the Korean language. Reviews were scraped from Naver movies. The dataset construction is based on the method noted in Large movie review dataset from Maas et al., 2011.\n", "dataset_name": "nsmc"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ko"], "is_gated": false}, "numer_sense": {"dataset_name": "numer_sense", "description": "NumerSense is a new numerical commonsense reasoning probing task, with a diagnostic dataset consisting of 3,145 masked-word-prediction probes.\n\nWe propose to study whether numerical commonsense knowledge can be induced from pre-trained language models like BERT, and to what extent this access to knowledge robust against adversarial examples is. We hope this will be beneficial for tasks such as knowledge base completion and open-domain question answering.", "downloads": 449, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence\": \"\\\"Some plant varieties can grow up to feet t...\", \"target\": \"\\\"nine\\\"\"}", "columns": ["sentence", "target"], "columns_mapping": {"sentence": "sentence", "target": "target"}, "dataset_description": "NumerSense is a new numerical commonsense reasoning probing task, with a diagnostic dataset consisting of 3,145 masked-word-prediction probes.\n\nWe propose to study whether numerical commonsense knowledge can be induced from pre-trained language models like BERT, and to what extent this access to knowledge robust against adversarial examples is. We hope this will be beneficial for tasks such as knowledge base completion and open-domain question answering.\n", "dataset_name": "numer_sense"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:slot-filling", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|other", "language:en"], "is_gated": false}, "numeric_fused_head": {"dataset_name": "numeric_fused_head", "description": "Fused Head constructions are noun phrases in which the head noun is missing and is said to be \"fused\" with its dependent modifier. This missing information is implicit and is important for sentence understanding.The missing heads are easily filled in by humans, but pose a challenge for computational models.\n\nFor example, in the sentence: \"I bought 5 apples but got only 4.\", 4 is a Fused-Head, and the missing head is apples, which appear earlier in the sentence.\n\nThis is a crowd-sourced dataset of 10k numerical fused head examples (1M tokens).", "downloads": 431, "configs": {"identification": {"config_name": "identification", "sample_row": "{\"tokens\": \"[\\\"There\\\", \\\"is\\\", \\\"3500\\\", \\\"...\\\", \\\"and\\\", \\\"since\\\", \\\"yo...\", \"start_index\": \"2\", \"end_index\": \"3\", \"label\": \"1\"}", "columns": ["tokens", "start_index", "end_index", "label"], "columns_mapping": {"tokens": "tokens", "start_index": "start_index", "end_index": "end_index", "label": "label"}, "dataset_description": "Fused Head constructions are noun phrases in which the head noun is missing and is said to be \"fused\" with its dependent modifier. This missing information is implicit and is important for sentence understanding.The missing heads are easily filled in by humans, but pose a challenge for computational models.\n\nFor example, in the sentence: \"I bought 5 apples but got only 4.\", 4 is a Fused-Head, and the missing head is apples, which appear earlier in the sentence.\n\nThis is a crowd-sourced dataset of 10k numerical fused head examples (1M tokens).\n", "dataset_name": "numeric_fused_head"}, "resolution": {"config_name": "resolution", "sample_row": "{\"tokens\": \"[\\\"What\\\", \\\"the\\\", \\\"fuck\\\", \\\"are\\\", \\\"you\\\", \\\"doing\\\", \\\"?\\\"...\", \"line_indices\": \"[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3...\", \"head\": \"[\\\"AGE\\\"]\", \"speakers\": \"[\\\"Stuart Alan Jones\\\", \\\"Stuart Alan Jones\\\", \\\"Stuart...\", \"anchors_indices\": \"[12]\"}", "columns": ["tokens", "line_indices", "head", "speakers", "anchors_indices"], "columns_mapping": {"tokens": "tokens", "line_indices": "line_indices", "head": "head", "speakers": "speakers", "anchors_indices": "anchors_indices"}, "dataset_description": "Fused Head constructions are noun phrases in which the head noun is missing and is said to be \"fused\" with its dependent modifier. This missing information is implicit and is important for sentence understanding.The missing heads are easily filled in by humans, but pose a challenge for computational models.\n\nFor example, in the sentence: \"I bought 5 apples but got only 4.\", 4 is a Fused-Head, and the missing head is apples, which appear earlier in the sentence.\n\nThis is a crowd-sourced dataset of 10k numerical fused head examples (1M tokens).\n", "dataset_name": "numeric_fused_head"}}, "tags": ["task_categories:token-classification", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "fused-head-identification"], "is_gated": false}, "oclar": {"dataset_name": "oclar", "description": "The researchers of OCLAR Marwan et al. (2019), they gathered Arabic costumer reviews from Google reviewsa and Zomato\nwebsite (https://www.zomato.com/lebanon) on wide scope of domain, including restaurants, hotels, hospitals, local shops,\netc.The corpus finally contains 3916 reviews in 5-rating scale. For this research purpose, the positive class considers\nrating stars from 5 to 3 of 3465 reviews, and the negative class is represented from values of 1 and 2 of about\n451 texts.", "downloads": 291, "configs": {"default": {"config_name": "default", "sample_row": "{\"pagename\": \"\\\"Beirut Golden Plaza Suites\\\"\", \"review\": \"\\\"\\\\u0647\\\\u0630\\\\u0627 \\\\u0627\\\\u0644\\\\u0641\\\\u0646\\\\u062f...\", \"rating\": \"2\"}", "columns": ["pagename", "review", "rating"], "columns_mapping": {"pagename": "pagename", "review": "review", "rating": "rating"}, "dataset_description": "The researchers of OCLAR Marwan et al. (2019), they gathered Arabic costumer reviews from Google reviewsa and Zomato\nwebsite (https://www.zomato.com/lebanon) on wide scope of domain, including restaurants, hotels, hospitals, local shops,\netc.The corpus finally contains 3916 reviews in 5-rating scale. For this research purpose, the positive class considers\nrating stars from 5 to 3 of 3465 reviews, and the negative class is represented from values of 1 and 2 of about\n451 texts.\n", "dataset_name": "oclar"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:sentiment-classification", "task_ids:sentiment-scoring", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "offenseval2020_tr": {"dataset_name": "offenseval2020_tr", "description": "OffensEval-TR 2020 is a Turkish offensive language corpus. The corpus consist of randomly sampled tweets and annotated in a similar way to OffensEval and GermEval.", "downloads": 302, "configs": {"offenseval2020-turkish": {"config_name": "offenseval2020-turkish", "sample_row": "{\"id\": \"20948\", \"tweet\": \"\\\"@USER en g\\\\u00fczel uyuyan insan \\\\u00f6d\\\\u00fcl\\\\u...\", \"subtask_a\": \"0\"}", "columns": ["id", "tweet", "subtask_a"], "columns_mapping": {"id": "id", "tweet": "tweet", "subtask_a": "subtask_a"}, "dataset_description": "OffensEval-TR 2020 is a Turkish offensive language corpus. The corpus consist of randomly sampled tweets and annotated in a similar way to OffensEval and GermEval.\n", "dataset_name": "offenseval2020_tr"}}, "tags": ["task_categories:text-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:tr", "offensive-language-classification"], "is_gated": false}, "ofis_publik": {"dataset_name": "ofis_publik", "description": "Texts from the Ofis Publik ar Brezhoneg (Breton Language Board) provided by Francis Tyers\n2 languages, total number of files: 278\ntotal number of tokens: 2.12M\ntotal number of sentence fragments: 0.13M", "downloads": 288, "configs": {"br-fr": {"config_name": "br-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"Ema\\\\u00f1 Rannvro Breizh hag he c\\\\u2019hevelerien...\", \"translation.fr\": \"\\\"La R\\\\u00e9gion Bretagne et ses partenaires se pr\\\\...\"}", "columns": ["id", "translation_br", "translation_fr"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.fr": "translation_fr"}, "dataset_description": "Texts from the Ofis Publik ar Brezhoneg (Breton Language Board) provided by Francis Tyers\n2 languages, total number of files: 278\ntotal number of tokens: 2.12M\ntotal number of sentence fragments: 0.13M\n", "dataset_name": "ofis_publik"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:br", "language:fr"], "is_gated": false}, "onestop_qa": {"dataset_name": "onestop_qa", "description": "OneStopQA is a multiple choice reading comprehension dataset annotated according to the STARC (Structured Annotations for Reading Comprehension) scheme. The reading materials are Guardian articles taken from the [OneStopEnglish corpus](https://github.com/nishkalavallabhi/OneStopEnglishCorpus). Each article comes in three difficulty levels, Elementary, Intermediate and Advanced. Each paragraph is annotated with three multiple choice reading comprehension questions. The reading comprehension questions can be answered based on any of the three paragraph levels.", "downloads": 295, "configs": {"default": {"config_name": "default", "sample_row": "{\"title\": \"\\\"101-Year-Old Bottle Message\\\"\", \"paragraph\": \"\\\"Angela Erdmann never knew her grandfather. He die...\", \"level\": \"0\", \"question\": \"\\\"Who threw the bottle into the Baltic Sea?\\\"\", \"paragraph_index\": \"0\", \"answers\": \"[\\\"Angela Erdmann\\\\u2019s grandfather\\\", \\\"Angela Erdm...\", \"a_span\": \"[0, 45]\", \"d_span\": \"[63, 63]\"}", "columns": ["title", "paragraph", "level", "question", "paragraph_index", "answers", "a_span", "d_span"], "columns_mapping": {"title": "title", "paragraph": "paragraph", "level": "level", "question": "question", "paragraph_index": "paragraph_index", "answers": "answers", "a_span": "a_span", "d_span": "d_span"}, "dataset_description": "OneStopQA is a multiple choice reading comprehension dataset annotated according to the STARC (Structured Annotations for Reading Comprehension) scheme. The reading materials are Guardian articles taken from the [OneStopEnglish corpus](https://github.com/nishkalavallabhi/OneStopEnglishCorpus). Each article comes in three difficulty levels, Elementary, Intermediate and Advanced. Each paragraph is annotated with three multiple choice reading comprehension questions. The reading comprehension questions can be answered based on any of the three paragraph levels.\n", "dataset_name": "onestop_qa"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "source_datasets:extended|onestop_english", "language:en"], "is_gated": false}, "open_subtitles": {"dataset_name": "open_subtitles", "description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G", "downloads": 3487, "configs": {"bs-eo": {"config_name": "bs-eo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"meta.year\": \"1973\", \"meta.imdbId\": \"70215\", \"meta.subtitleId.bs\": \"6080330\", \"meta.subtitleId.eo\": \"4010963\", \"meta.sentenceIds.bs\": \"[1]\", \"meta.sentenceIds.eo\": \"[2]\", \"translation.bs\": \"\\\"Gospodine Borgard...\\\"\", \"translation.eo\": \"\\\"Alvenis la respondo por vi el Nov-Orleano.\\\"\"}", "columns": ["id", "meta_year", "meta_imdbId", "meta_subtitleId_bs", "meta_subtitleId_eo", "meta_sentenceIds_bs", "meta_sentenceIds_eo", "translation_bs", "translation_eo"], "columns_mapping": {"id": "id", "meta.year": "meta_year", "meta.imdbId": "meta_imdbId", "meta.subtitleId.bs": "meta_subtitleId_bs", "meta.subtitleId.eo": "meta_subtitleId_eo", "meta.sentenceIds.bs": "meta_sentenceIds_bs", "meta.sentenceIds.eo": "meta_sentenceIds_eo", "translation.bs": "translation_bs", "translation.eo": "translation_eo"}, "dataset_description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G\n", "dataset_name": "open_subtitles"}, "fr-hy": {"config_name": "fr-hy", "sample_row": "{\"id\": \"\\\"0\\\"\", \"meta.year\": \"1971\", \"meta.imdbId\": \"67372\", \"meta.subtitleId.fr\": \"3693493\", \"meta.subtitleId.hy\": \"6711716\", \"meta.sentenceIds.fr\": \"[1]\", \"meta.sentenceIds.hy\": \"[1]\", \"translation.fr\": \"\\\"A quand rendez-vous prochain ?\\\"\", \"translation.hy\": \"\\\"\\\\u054e\\\\u0565\\\\u0570\\\\u0568 \\\\u0566\\\\u0561\\\\u0566\\\\u0580...\"}", "columns": ["id", "meta_year", "meta_imdbId", "meta_subtitleId_fr", "meta_subtitleId_hy", "meta_sentenceIds_fr", "meta_sentenceIds_hy", "translation_fr", "translation_hy"], "columns_mapping": {"id": "id", "meta.year": "meta_year", "meta.imdbId": "meta_imdbId", "meta.subtitleId.fr": "meta_subtitleId_fr", "meta.subtitleId.hy": "meta_subtitleId_hy", "meta.sentenceIds.fr": "meta_sentenceIds_fr", "meta.sentenceIds.hy": "meta_sentenceIds_hy", "translation.fr": "translation_fr", "translation.hy": "translation_hy"}, "dataset_description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G\n", "dataset_name": "open_subtitles"}, "da-ru": {"config_name": "da-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"meta.year\": \"1927\", \"meta.imdbId\": \"17136\", \"meta.subtitleId.da\": \"61728\", \"meta.subtitleId.ru\": \"42690\", \"meta.sentenceIds.da\": \"[1, 2]\", \"meta.sentenceIds.ru\": \"[1]\", \"translation.da\": \"\\\"Hver epoke skaber sin efterf\\\\u00f8lger - Jules Mi...\", \"translation.ru\": \"\\\"\\\\u041a\\\\u0430\\\\u0436\\\\u0434\\\\u0430\\\\u044f \\\\u044d\\\\u043f...\"}", "columns": ["id", "meta_year", "meta_imdbId", "meta_subtitleId_da", "meta_subtitleId_ru", "meta_sentenceIds_da", "meta_sentenceIds_ru", "translation_da", "translation_ru"], "columns_mapping": {"id": "id", "meta.year": "meta_year", "meta.imdbId": "meta_imdbId", "meta.subtitleId.da": "meta_subtitleId_da", "meta.subtitleId.ru": "meta_subtitleId_ru", "meta.sentenceIds.da": "meta_sentenceIds_da", "meta.sentenceIds.ru": "meta_sentenceIds_ru", "translation.da": "translation_da", "translation.ru": "translation_ru"}, "dataset_description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G\n", "dataset_name": "open_subtitles"}, "en-hi": {"config_name": "en-hi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"meta.year\": \"1948\", \"meta.imdbId\": \"40522\", \"meta.subtitleId.en\": \"4180294\", \"meta.subtitleId.hi\": \"4239106\", \"meta.sentenceIds.en\": \"[1]\", \"meta.sentenceIds.hi\": \"[1]\", \"translation.en\": \"\\\"THE BICYCLE THIEF\\\"\", \"translation.hi\": \"\\\"\\\\u0938\\\\u093e\\\\u0907\\\\u0915\\\\u093f\\\\u0932 \\\\u091a\\\\u094b...\"}", "columns": ["id", "meta_year", "meta_imdbId", "meta_subtitleId_en", "meta_subtitleId_hi", "meta_sentenceIds_en", "meta_sentenceIds_hi", "translation_en", "translation_hi"], "columns_mapping": {"id": "id", "meta.year": "meta_year", "meta.imdbId": "meta_imdbId", "meta.subtitleId.en": "meta_subtitleId_en", "meta.subtitleId.hi": "meta_subtitleId_hi", "meta.sentenceIds.en": "meta_sentenceIds_en", "meta.sentenceIds.hi": "meta_sentenceIds_hi", "translation.en": "translation_en", "translation.hi": "translation_hi"}, "dataset_description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G\n", "dataset_name": "open_subtitles"}, "bn-is": {"config_name": "bn-is", "sample_row": "{\"id\": \"\\\"0\\\"\", \"meta.year\": \"1981\", \"meta.imdbId\": \"82971\", \"meta.subtitleId.bn\": \"6443778\", \"meta.subtitleId.is\": \"4634729\", \"meta.sentenceIds.bn\": \"[2]\", \"meta.sentenceIds.is\": \"[2]\", \"translation.bn\": \"\\\"\\\\u09b9\\\\u09ac\\\\u09bf\\\\u099f\\\\u09cb\\\\u09b8 \\\\u0995\\\\u09be...\", \"translation.is\": \"\\\"Eitri\\\\u00f0 er enn \\\\u00f6flugt.\\\"\"}", "columns": ["id", "meta_year", "meta_imdbId", "meta_subtitleId_bn", "meta_subtitleId_is", "meta_sentenceIds_bn", "meta_sentenceIds_is", "translation_bn", "translation_is"], "columns_mapping": {"id": "id", "meta.year": "meta_year", "meta.imdbId": "meta_imdbId", "meta.subtitleId.bn": "meta_subtitleId_bn", "meta.subtitleId.is": "meta_subtitleId_is", "meta.sentenceIds.bn": "meta_sentenceIds_bn", "meta.sentenceIds.is": "meta_sentenceIds_is", "translation.bn": "translation_bn", "translation.is": "translation_is"}, "dataset_description": "This is a new collection of translated movie subtitles from http://www.opensubtitles.org/.\n\nIMPORTANT: If you use the OpenSubtitle corpus: Please, add a link to http://www.opensubtitles.org/ to your website and to your reports and publications produced with the data!\n\nThis is a slightly cleaner version of the subtitle collection using improved sentence alignment and better language checking.\n\n62 languages, 1,782 bitexts\ntotal number of files: 3,735,070\ntotal number of tokens: 22.10G\ntotal number of sentence fragments: 3.35G\n", "dataset_name": "open_subtitles"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:af", "language:ar", "language:bg", "language:bn", "language:br", "language:bs", "language:ca", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:fi", "language:fr", "language:gl", "language:he", "language:hi", "language:hr", "language:hu", "language:hy", "language:id", "language:is", "language:it", "language:ja", "language:ka", "language:kk", "language:ko", "language:lt", "language:lv", "language:mk", "language:ml", "language:ms", "language:nl", "language:no", "language:pl", "language:pt", "language:ro", "language:ru", "language:si", "language:sk", "language:sl", "language:sq", "language:sr", "language:sv", "language:ta", "language:te", "language:th", "language:tl", "language:tr", "language:uk", "language:ur", "language:vi", "language:zh"], "is_gated": false}, "openbookqa": {"dataset_name": "openbookqa", "description": "OpenBookQA aims to promote research in advanced question-answering, probing a deeper understanding of both the topic\n(with salient facts summarized as an open book, also provided with the dataset) and the language it is expressed in. In\nparticular, it contains questions that require multi-step reasoning, use of additional common and commonsense knowledge,\nand rich text comprehension.\nOpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding\nof a subject.", "downloads": 106856, "configs": {"main": {"config_name": "main", "sample_row": "{\"id\": \"\\\"7-980\\\"\", \"question_stem\": \"\\\"The sun is responsible for\\\"\", \"choices.text\": \"[\\\"puppies learning new tricks\\\", \\\"children growing ...\", \"choices.label\": \"[\\\"A\\\", \\\"B\\\", \\\"C\\\", \\\"D\\\"]\", \"answerKey\": \"\\\"D\\\"\"}", "columns": ["id", "question_stem", "choices_text", "choices_label", "answerKey"], "columns_mapping": {"id": "id", "question_stem": "question_stem", "choices.text": "choices_text", "choices.label": "choices_label", "answerKey": "answerKey"}, "dataset_description": "OpenBookQA aims to promote research in advanced question-answering, probing a deeper understanding of both the topic\n(with salient facts summarized as an open book, also provided with the dataset) and the language it is expressed in. In\nparticular, it contains questions that require multi-step reasoning, use of additional common and commonsense knowledge,\nand rich text comprehension.\nOpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding\nof a subject.\n", "dataset_name": "openbookqa"}, "additional": {"config_name": "additional", "sample_row": "{\"id\": \"\\\"7-980\\\"\", \"question_stem\": \"\\\"The sun is responsible for\\\"\", \"choices.text\": \"[\\\"puppies learning new tricks\\\", \\\"children growing ...\", \"choices.label\": \"[\\\"A\\\", \\\"B\\\", \\\"C\\\", \\\"D\\\"]\", \"answerKey\": \"\\\"D\\\"\", \"fact1\": \"\\\"the sun is the source of energy for physical cycl...\", \"humanScore\": \"1.0\", \"clarity\": \"2.0\", \"turkIdAnonymized\": \"\\\"b356d338b7\\\"\"}", "columns": ["id", "question_stem", "choices_text", "choices_label", "answerKey", "fact1", "humanScore", "clarity", "turkIdAnonymized"], "columns_mapping": {"id": "id", "question_stem": "question_stem", "choices.text": "choices_text", "choices.label": "choices_label", "answerKey": "answerKey", "fact1": "fact1", "humanScore": "humanScore", "clarity": "clarity", "turkIdAnonymized": "turkIdAnonymized"}, "dataset_description": "OpenBookQA aims to promote research in advanced question-answering, probing a deeper understanding of both the topic\n(with salient facts summarized as an open book, also provided with the dataset) and the language it is expressed in. In\nparticular, it contains questions that require multi-step reasoning, use of additional common and commonsense knowledge,\nand rich text comprehension.\nOpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding\nof a subject.\n", "dataset_name": "openbookqa"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "Skylion007/openwebtext": {"dataset_name": "Skylion007/openwebtext", "description": "An open-source replication of the WebText dataset from OpenAI.", "downloads": 7369, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\"Port-au-Prince, Haiti (CNN) -- Earthquake victims...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "An open-source replication of the WebText dataset from OpenAI.\n", "dataset_name": "Skylion007/openwebtext"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "opinosis": {"dataset_name": "opinosis", "description": "The Opinosis Opinion Dataset consists of sentences extracted from reviews for 51 topics.\nTopics and opinions are obtained from Tripadvisor, Edmunds.com and Amazon.com.", "downloads": 765, "configs": {"default": {"config_name": "default", "sample_row": "{\"review_sents\": \"\\\", and is very, very accurate .\\\\r\\\\n but for the mo...\", \"summaries\": \"[\\\"This unit is generally quite accurate. \\\\r\\\\nSet-...\"}", "columns": ["review_sents", "summaries"], "columns_mapping": {"review_sents": "review_sents", "summaries": "summaries"}, "dataset_description": "\nThe Opinosis Opinion Dataset consists of sentences extracted from reviews for 51 topics.\nTopics and opinions are obtained from Tripadvisor, Edmunds.com and Amazon.com.\n", "dataset_name": "opinosis"}}, "tags": ["task_categories:summarization", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "abstractive-summarization"], "is_gated": false}, "opus_books": {"dataset_name": "opus_books", "description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M", "downloads": 22187, "configs": {"ca-de": {"config_name": "ca-de", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ca\": \"\\\"Source: Project GutenbergTranslation: Josep Carne...\", \"translation.de\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_ca", "translation_de"], "columns_mapping": {"id": "id", "translation.ca": "translation_ca", "translation.de": "translation_de"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "ca-en": {"config_name": "ca-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ca\": \"\\\"Source: Project GutenbergTranslation: Josep Carne...\", \"translation.en\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_ca", "translation_en"], "columns_mapping": {"id": "id", "translation.ca": "translation_ca", "translation.en": "translation_en"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-en": {"config_name": "de-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: http://www.zeno.org - Contumax GmbH & Co....\", \"translation.en\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_de", "translation_en"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.en": "translation_en"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "el-en": {"config_name": "el-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.en\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_el", "translation_en"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.en": "translation_en"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-eo": {"config_name": "de-eo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Translation: Antonie Zimmermann\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. KEARNE...\"}", "columns": ["id", "translation_de", "translation_eo"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.eo": "translation_eo"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-eo": {"config_name": "en-eo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"ALICE'S ADVENTURES IN WONDERLAND\\\"\", \"translation.eo\": \"\\\"La aventuroj de Alicio en Mirlando\\\"\"}", "columns": ["id", "translation_en", "translation_eo"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.eo": "translation_eo"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-es": {"config_name": "de-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: http://www.zeno.org - Contumax GmbH & Co....\", \"translation.es\": \"\\\"Source: http://librosgratis.liblit.com/\\\"\"}", "columns": ["id", "translation_de", "translation_es"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.es": "translation_es"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "el-es": {"config_name": "el-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.es\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_el", "translation_es"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.es": "translation_es"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-es": {"config_name": "en-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: Project GutenbergAudiobook available here...\", \"translation.es\": \"\\\"Source: Wikisource & librodot.com\\\"\"}", "columns": ["id", "translation_en", "translation_es"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "eo-es": {"config_name": "eo-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. KEARNE...\", \"translation.es\": \"\\\"Source: http://mimosa.pntic.mec.es/\\\"\"}", "columns": ["id", "translation_eo", "translation_es"], "columns_mapping": {"id": "id", "translation.eo": "translation_eo", "translation.es": "translation_es"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-fi": {"config_name": "en-fi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: manybooks.netAudiobook available here\\\"\", \"translation.fi\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_en", "translation_fi"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fi": "translation_fi"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-fi": {"config_name": "es-fi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: Wikisource & bibliotecasvirtuales.com\\\"\", \"translation.fi\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_es", "translation_fi"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.fi": "translation_fi"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: http://www.zeno.org - Contumax GmbH & Co....\", \"translation.fr\": \"\\\"Source: Project GutenbergTranslation: No\\\\u00ebmie...\"}", "columns": ["id", "translation_de", "translation_fr"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "el-fr": {"config_name": "el-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.fr\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_el", "translation_fr"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"The Wanderer\\\"\", \"translation.fr\": \"\\\"Le grand Meaulnes\\\"\"}", "columns": ["id", "translation_en", "translation_fr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "eo-fr": {"config_name": "eo-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. KEARNE...\", \"translation.fr\": \"\\\"Source: WikisourceTranslation: Henri Bu\\\\u00e9\\\"\"}", "columns": ["id", "translation_eo", "translation_fr"], "columns_mapping": {"id": "id", "translation.eo": "translation_eo", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-fr": {"config_name": "es-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: http://librosgratis.liblit.com/\\\"\", \"translation.fr\": \"\\\"Source: Project GutenbergTranslation: No\\\\u00ebmie...\"}", "columns": ["id", "translation_es", "translation_fr"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fi-fr": {"config_name": "fi-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.fr\": \"\\\"Source: http://www.ebooksgratuits.com/\\\"\"}", "columns": ["id", "translation_fi", "translation_fr"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.fr": "translation_fr"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "ca-hu": {"config_name": "ca-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ca\": \"\\\"Source: Project GutenbergTranslation: Josep Carne...\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Koroknay Istv\\\\u00...\"}", "columns": ["id", "translation_ca", "translation_hu"], "columns_mapping": {"id": "id", "translation.ca": "translation_ca", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-hu": {"config_name": "de-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: http://www.zeno.org - Contumax GmbH & Co....\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Ruzitska M\\\\u00e1r...\"}", "columns": ["id", "translation_de", "translation_hu"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "el-hu": {"config_name": "el-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Kem\\\\u00e9ny G\\\\u00...\"}", "columns": ["id", "translation_el", "translation_hu"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-hu": {"config_name": "en-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: Project GutenbergAudiobook available here...\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Szenczi Mikl\\\\u00f...\"}", "columns": ["id", "translation_en", "translation_hu"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "eo-hu": {"config_name": "eo-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. KEARNE...\", \"translation.hu\": \"\\\"Source: mek.oszk.huAudiobook available here\\\"\"}", "columns": ["id", "translation_eo", "translation_hu"], "columns_mapping": {"id": "id", "translation.eo": "translation_eo", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-hu": {"config_name": "fr-hu", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: ebooksgratuits.comTranslation: V. Leconte...\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Szenczi Mikl\\\\u00f...\"}", "columns": ["id", "translation_fr", "translation_hu"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.hu": "translation_hu"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-it": {"config_name": "de-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: http://www.zeno.org - Contumax GmbH & Co....\", \"translation.it\": \"\\\"Source: www.liberliber.it/Audiobook available her...\"}", "columns": ["id", "translation_de", "translation_it"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-it": {"config_name": "en-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.it\": \"\\\"Source: www.liberliber.it/Audiobook available her...\"}", "columns": ["id", "translation_en", "translation_it"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "eo-it": {"config_name": "eo-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. KEARNE...\", \"translation.it\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_eo", "translation_it"], "columns_mapping": {"id": "id", "translation.eo": "translation_eo", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-it": {"config_name": "es-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: http://librosgratis.liblit.com/\\\"\", \"translation.it\": \"\\\"Source: www.liberliber.it/Audiobook available her...\"}", "columns": ["id", "translation_es", "translation_it"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-it": {"config_name": "fr-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: Project GutenbergTranslation: No\\\\u00ebmie...\", \"translation.it\": \"\\\"Source: www.liberliber.it/Audiobook available her...\"}", "columns": ["id", "translation_fr", "translation_it"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-it": {"config_name": "hu-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Ruzitska M\\\\u00e1r...\", \"translation.it\": \"\\\"Source: www.liberliber.it/Audiobook available her...\"}", "columns": ["id", "translation_hu", "translation_it"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.it": "translation_it"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "ca-nl": {"config_name": "ca-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ca\": \"\\\"Source: Project GutenbergTranslation: Josep Carne...\", \"translation.nl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_ca", "translation_nl"], "columns_mapping": {"id": "id", "translation.ca": "translation_ca", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-nl": {"config_name": "de-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.nl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_de", "translation_nl"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-nl": {"config_name": "en-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: Project GutenbergAudiobook available here...\", \"translation.nl\": \"\\\"Source: Project GutenbergTranslation: Gonne Van U...\"}", "columns": ["id", "translation_en", "translation_nl"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-nl": {"config_name": "es-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: Wikisource & librodot.com\\\"\", \"translation.nl\": \"\\\"Source: Project GutenbergTranslation: Gonne Van U...\"}", "columns": ["id", "translation_es", "translation_nl"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-nl": {"config_name": "fr-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.nl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_fr", "translation_nl"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-nl": {"config_name": "hu-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"Source: http://mek.oszk.hu/Translation: Tam\\\\u00e1...\", \"translation.nl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_hu", "translation_nl"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "it-nl": {"config_name": "it-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.it\": \"\\\"Source: WikisourceTranslation: Gaetano Barbieri\\\"...\", \"translation.nl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_it", "translation_nl"], "columns_mapping": {"id": "id", "translation.it": "translation_it", "translation.nl": "translation_nl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-no": {"config_name": "en-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: manybooks.netAudiobook available here\\\"\", \"translation.no\": \"\\\"Source: ebook made by Lars I. N\\\\u00e6sheimTransla...\"}", "columns": ["id", "translation_en", "translation_no"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.no": "translation_no"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-no": {"config_name": "es-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: Wikisource & bibliotecasvirtuales.com\\\"\", \"translation.no\": \"\\\"Source: ebook made by Lars I. N\\\\u00e6sheimTransla...\"}", "columns": ["id", "translation_es", "translation_no"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.no": "translation_no"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fi-no": {"config_name": "fi-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.no\": \"\\\"Source: ebook made by Lars I. N\\\\u00e6sheimTransla...\"}", "columns": ["id", "translation_fi", "translation_no"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.no": "translation_no"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-no": {"config_name": "fr-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: http://www.ebooksgratuits.com/\\\"\", \"translation.no\": \"\\\"Source: ebook made by Lars I. N\\\\u00e6sheimTransla...\"}", "columns": ["id", "translation_fr", "translation_no"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.no": "translation_no"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-no": {"config_name": "hu-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Antal \\\\u00c1rkos\\\"...\", \"translation.no\": \"\\\"Source: ebook made by Lars I. N\\\\u00e6sheimTransla...\"}", "columns": ["id", "translation_hu", "translation_no"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.no": "translation_no"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-pl": {"config_name": "en-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: manybooks.netAudiobook available here\\\"\", \"translation.pl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_en", "translation_pl"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.pl": "translation_pl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fi-pl": {"config_name": "fi-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.pl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_fi", "translation_pl"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.pl": "translation_pl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-pl": {"config_name": "fr-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: http://www.ebooksgratuits.com/\\\"\", \"translation.pl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_fr", "translation_pl"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.pl": "translation_pl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-pl": {"config_name": "hu-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"Source: mek.oszk.huTranslation: Antal \\\\u00c1rkos\\\"...\", \"translation.pl\": \"\\\"Source: Project Gutenberg\\\"\"}", "columns": ["id", "translation_hu", "translation_pl"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.pl": "translation_pl"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-pt": {"config_name": "de-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Translation: Antonie Zimmermann\\\"\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_de", "translation_pt"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-pt": {"config_name": "en-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"ALICE'S ADVENTURES IN WONDERLAND\\\"\", \"translation.pt\": \"\\\"Alice no Pa\\\\u00eds das Maravilhas\\\"\"}", "columns": ["id", "translation_en", "translation_pt"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "eo-pt": {"config_name": "eo-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.eo\": \"\\\"Source: Project GutenbergTranslation: E.L. KEARNE...\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_eo", "translation_pt"], "columns_mapping": {"id": "id", "translation.eo": "translation_eo", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-pt": {"config_name": "es-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Source: http://mimosa.pntic.mec.es/\\\"\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_es", "translation_pt"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-pt": {"config_name": "fr-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: WikisourceTranslation: Henri Bu\\\\u00e9\\\"\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_fr", "translation_pt"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-pt": {"config_name": "hu-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"Source: mek.oszk.huAudiobook available here\\\"\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_hu", "translation_pt"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "it-pt": {"config_name": "it-pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.it\": \"\\\"Source: Project Gutenberg\\\"\", \"translation.pt\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_it", "translation_pt"], "columns_mapping": {"id": "id", "translation.it": "translation_it", "translation.pt": "translation_pt"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "de-ru": {"config_name": "de-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Anna Karenina\\\"\", \"translation.ru\": \"\\\"\\\\u0410\\\\u043d\\\\u043d\\\\u0430 \\\\u041a\\\\u0430\\\\u0440\\\\u0435...\"}", "columns": ["id", "translation_de", "translation_ru"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-ru": {"config_name": "en-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Anna Karenina\\\"\", \"translation.ru\": \"\\\"\\\\u0410\\\\u043d\\\\u043d\\\\u0430 \\\\u041a\\\\u0430\\\\u0440\\\\u0435...\"}", "columns": ["id", "translation_en", "translation_ru"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "es-ru": {"config_name": "es-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Ana Karenina\\\"\", \"translation.ru\": \"\\\"\\\\u0410\\\\u043d\\\\u043d\\\\u0430 \\\\u041a\\\\u0430\\\\u0440\\\\u0435...\"}", "columns": ["id", "translation_es", "translation_ru"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-ru": {"config_name": "fr-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"LE MA\\\\u00ceTRE ET MARGUERITE\\\"\", \"translation.ru\": \"\\\"\\\\u041c\\\\u0430\\\\u0441\\\\u0442\\\\u0435\\\\u0440 \\\\u0438 \\\\u041...\"}", "columns": ["id", "translation_fr", "translation_ru"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "hu-ru": {"config_name": "hu-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hu\": \"\\\"A Mester \\\\u00e9s Margarita\\\"\", \"translation.ru\": \"\\\"\\\\u041c\\\\u0430\\\\u0441\\\\u0442\\\\u0435\\\\u0440 \\\\u0438 \\\\u041...\"}", "columns": ["id", "translation_hu", "translation_ru"], "columns_mapping": {"id": "id", "translation.hu": "translation_hu", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "it-ru": {"config_name": "it-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.it\": \"\\\"Anna Karenina\\\"\", \"translation.ru\": \"\\\"\\\\u0410\\\\u043d\\\\u043d\\\\u0430 \\\\u041a\\\\u0430\\\\u0440\\\\u0435...\"}", "columns": ["id", "translation_it", "translation_ru"], "columns_mapping": {"id": "id", "translation.it": "translation_it", "translation.ru": "translation_ru"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "en-sv": {"config_name": "en-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Source: Project GutenbergAudiobook available here...\", \"translation.sv\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_en", "translation_sv"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.sv": "translation_sv"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "fr-sv": {"config_name": "fr-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Source: ebooksgratuits.comAudiobook available her...\", \"translation.sv\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_fr", "translation_sv"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.sv": "translation_sv"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}, "it-sv": {"config_name": "it-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.it\": \"\\\"Translation: Silvio Spaventa Filippi\\\"\", \"translation.sv\": \"\\\"Source: Wikisource\\\"\"}", "columns": ["id", "translation_it", "translation_sv"], "columns_mapping": {"id": "id", "translation.it": "translation_it", "translation.sv": "translation_sv"}, "dataset_description": "This is a collection of copyright free books aligned by Andras Farkas, which are available from http://www.farkastranslations.com/bilingual_books.php\nNote that the texts are rather dated due to copyright issues and that some of them are manually reviewed (check the meta-data at the top of the corpus files in XML). The source is multilingually aligned, which is available from http://www.farkastranslations.com/bilingual_books.php. In OPUS, the alignment is formally bilingual but the multilingual alignment can be recovered from the XCES sentence alignment files. Note also that the alignment units from the original source may include multi-sentence paragraphs, which are split and sentence-aligned in OPUS.\nAll texts are freely available for personal, educational and research use. Commercial use (e.g. reselling as parallel books) and mass redistribution without explicit permission are not granted. Please acknowledge the source when using the data!\n\n16 languages, 64 bitexts\ntotal number of files: 158\ntotal number of tokens: 19.50M\ntotal number of sentence fragments: 0.91M\n", "dataset_name": "opus_books"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:ca", "language:de", "language:el", "language:en", "language:eo", "language:es", "language:fi", "language:fr", "language:hu", "language:it", "language:nl", "language:no", "language:pl", "language:pt", "language:ru", "language:sv"], "is_gated": false}, "opus_dgt": {"dataset_name": "opus_dgt", "description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M", "downloads": 1536, "configs": {"bg-ga": {"config_name": "bg-ga", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"\\\\u041f\\\\u0440\\\\u043e\\\\u0442\\\\u043e\\\\u043a\\\\u043e\\\\u043b ...\", \"translation.ga\": \"\\\"Miontuairisc cheartaitheach maidir le Coinbhinsi\\\\...\"}", "columns": ["id", "translation_bg", "translation_ga"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.ga": "translation_ga"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "bg-hr": {"config_name": "bg-hr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"\\\\u0420\\\\u0435\\\\u0448\\\\u0435\\\\u043d\\\\u0438\\\\u0435 \\\\u043d...\", \"translation.hr\": \"\\\"Odluka Zajedni\\\\u010dkog odbora EGP-a\\\"\"}", "columns": ["id", "translation_bg", "translation_hr"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.hr": "translation_hr"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "bg-sh": {"config_name": "bg-sh", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"\\\\u041f\\\\u0440\\\\u043e\\\\u0442\\\\u043e\\\\u043a\\\\u043e\\\\u043b ...\", \"translation.sh\": \"\\\"Ispravak Drugog dodatnog protokola uz Sporazum o ...\"}", "columns": ["id", "translation_bg", "translation_sh"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.sh": "translation_sh"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "fi-ga": {"config_name": "fi-ga", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"Oikaisup\\\\u00f6yt\\\\u00e4kirja yleissopimukseen tuom...\", \"translation.ga\": \"\\\"Miontuairisc cheartaitheach maidir le Coinbhinsi\\\\...\"}", "columns": ["id", "translation_fi", "translation_ga"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.ga": "translation_ga"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "es-ga": {"config_name": "es-ga", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Acta de correcci\\\\u00f3n de errores del Convenio r...\", \"translation.ga\": \"\\\"Miontuairisc cheartaitheach maidir le Coinbhinsi\\\\...\"}", "columns": ["id", "translation_es", "translation_ga"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.ga": "translation_ga"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "ga-sh": {"config_name": "ga-sh", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ga\": \"\\\"Leasuithe ar na hIarscr\\\\u00edbhinn\\\\u00ed a ghabha...\", \"translation.sh\": \"\\\"Izmjene prilog\\\\u00e2 Konvenciji iz Lugana od 30. ...\"}", "columns": ["id", "translation_ga", "translation_sh"], "columns_mapping": {"id": "id", "translation.ga": "translation_ga", "translation.sh": "translation_sh"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "hr-sk": {"config_name": "hr-sk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hr\": \"\\\"Odluka Zajedni\\\\u010dkog odbora EGP-a\\\"\", \"translation.sk\": \"\\\"Rozhodnutie Spolo\\\\u010dn\\\\u00e9ho v\\\\u00fdboru EHP\\\"...\"}", "columns": ["id", "translation_hr", "translation_sk"], "columns_mapping": {"id": "id", "translation.hr": "translation_hr", "translation.sk": "translation_sk"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "mt-sh": {"config_name": "mt-sh", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.mt\": \"\\\"Verbal ta' rettifika tat-tieni protokoll addizzjo...\", \"translation.sh\": \"\\\"Ispravak Drugog dodatnog protokola uz Sporazum o ...\"}", "columns": ["id", "translation_mt", "translation_sh"], "columns_mapping": {"id": "id", "translation.mt": "translation_mt", "translation.sh": "translation_sh"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "hr-sv": {"config_name": "hr-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.hr\": \"\\\"Odluka Zajedni\\\\u010dkog odbora EGP-a\\\"\", \"translation.sv\": \"\\\"Gemensamma EES-kommitt\\\\u00e9ns beslut\\\"\"}", "columns": ["id", "translation_hr", "translation_sv"], "columns_mapping": {"id": "id", "translation.hr": "translation_hr", "translation.sv": "translation_sv"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}, "ga-nl": {"config_name": "ga-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ga\": \"\\\"Miontuairisc cheartaitheach maidir le Coinbhinsi\\\\...\", \"translation.nl\": \"\\\"Proces-verbaal van verbetering van het Verdrag be...\"}", "columns": ["id", "translation_ga", "translation_nl"], "columns_mapping": {"id": "id", "translation.ga": "translation_ga", "translation.nl": "translation_nl"}, "dataset_description": "A collection of translation memories provided by the JRC. Source: https://ec.europa.eu/jrc/en/language-technologies/dgt-translation-memory\n25 languages, 299 bitexts\ntotal number of files: 817,410\ntotal number of tokens: 2.13G\ntotal number of sentence fragments: 113.52M\n", "dataset_name": "opus_dgt"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:ga", "language:hr", "language:hu", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:pl", "language:pt", "language:ro", "language:sh", "language:sk", "language:sl", "language:sv"], "is_gated": false}, "opus_dogc": {"dataset_name": "opus_dogc", "description": "This is a collection of documents from the Official Journal of the Government of Catalonia, in Catalan and Spanish languages, provided by Antoni Oliver Gonzalez from the Universitat Oberta de Catalunya.", "downloads": 283, "configs": {"tmx": {"config_name": "tmx", "sample_row": "{\"translation.ca\": \"\\\"En virtut de l ' annex 1 del Reial decret 2346 / ...\", \"translation.es\": \"\\\"En virtud del anexo 1 del Real decreto 2346/ 1996...\"}", "columns": ["translation_ca", "translation_es"], "columns_mapping": {"translation.ca": "translation_ca", "translation.es": "translation_es"}, "dataset_description": "This is a collection of documents from the Official Journal of the Government of Catalonia, in Catalan and Spanish languages, provided by Antoni Oliver Gonzalez from the Universitat Oberta de Catalunya.\n", "dataset_name": "opus_dogc"}}, "tags": ["task_categories:translation", "annotations_creators:no-annotation", "multilinguality:translation", "source_datasets:original", "language:ca", "language:es"], "is_gated": false}, "opus_gnome": {"dataset_name": "opus_gnome", "description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M", "downloads": 1556, "configs": {"ar-bal": {"config_name": "ar-bal", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ar\": \"\\\"\\\\u0625\\\\u0639\\\\u062f\\\\u0627\\\\u062f \\\\u0633\\\\u064a\\\\u0627...\", \"translation.bal\": \"\\\"\\\\u062a\\\\u0646\\\\u0638\\\\u06cc\\\\u0645 \\\\u06a9\\\\u062a\\\\u0646...\"}", "columns": ["id", "translation_ar", "translation_bal"], "columns_mapping": {"id": "id", "translation.ar": "translation_ar", "translation.bal": "translation_bal"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "bg-csb": {"config_name": "bg-csb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"GNOME\\\"\", \"translation.csb\": \"\\\"GNOME\\\"\"}", "columns": ["id", "translation_bg", "translation_csb"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.csb": "translation_csb"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "ca-en_GB": {"config_name": "ca-en_GB", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ca\": \"\\\"Accerciser\\\"\", \"translation.en_GB\": \"\\\"Accerciser\\\"\"}", "columns": ["id", "translation_ca", "translation_en_GB"], "columns_mapping": {"id": "id", "translation.ca": "translation_ca", "translation.en_GB": "translation_en_GB"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "cs-eo": {"config_name": "cs-eo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.cs\": \"\\\"Seznam ve v\\\\u00fdchoz\\\\u00edm nastaven\\\\u00ed zak\\\\u...\", \"translation.eo\": \"\\\"Listo de kromprogramoj kiuj defa\\\\u016dlte estas e...\"}", "columns": ["id", "translation_cs", "translation_eo"], "columns_mapping": {"id": "id", "translation.cs": "translation_cs", "translation.eo": "translation_eo"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "de-ha": {"config_name": "de-ha", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Info zu GNOME\\\"\", \"translation.ha\": \"\\\"Game da GNOME\\\"\"}", "columns": ["id", "translation_de", "translation_ha"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.ha": "translation_ha"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "cs-tk": {"config_name": "cs-tk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.cs\": \"\\\"GNOME\\\"\", \"translation.tk\": \"\\\"GNOME\\\"\"}", "columns": ["id", "translation_cs", "translation_tk"], "columns_mapping": {"id": "id", "translation.cs": "translation_cs", "translation.tk": "translation_tk"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "da-vi": {"config_name": "da-vi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.da\": \"\\\"Giv dit program en tilg\\\\u00e6ngelighedsoverhaling...\", \"translation.vi\": \"\\\"Th\\\\u1eed ra kh\\\\u1ea3 n\\\\u0103ng truy c\\\\u1eadp c\\\\u1...\"}", "columns": ["id", "translation_da", "translation_vi"], "columns_mapping": {"id": "id", "translation.da": "translation_da", "translation.vi": "translation_vi"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "en_GB-my": {"config_name": "en_GB-my", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en_GB\": \"\\\"Results %i\\\\u2013%i (out of %i)\\\"\", \"translation.my\": \"\\\"\\\\u101b\\\\u101c\\\\u1012\\\\u103a %i\\\\u2013%i ( %i \\\\u1019\\\\u...\"}", "columns": ["id", "translation_en_GB", "translation_my"], "columns_mapping": {"id": "id", "translation.en_GB": "translation_en_GB", "translation.my": "translation_my"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "el-sk": {"config_name": "el-sk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"Accerciser\\\"\", \"translation.sk\": \"\\\"Accerciser\\\"\"}", "columns": ["id", "translation_el", "translation_sk"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.sk": "translation_sk"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}, "de-tt": {"config_name": "de-tt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Ausgew\\\\u00e4hlter Link\\\"\", \"translation.tt\": \"\\\"Saylan\\\\u011fan B\\\\u00e4y\\\"\"}", "columns": ["id", "translation_de", "translation_tt"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.tt": "translation_tt"}, "dataset_description": "A parallel corpus of GNOME localization files. Source: https://l10n.gnome.org\n\n187 languages, 12,822 bitexts\ntotal number of files: 113,344\ntotal number of tokens: 267.27M\ntotal number of sentence fragments: 58.12M\n", "dataset_name": "opus_gnome"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:af", "language:am", "language:an", "language:ang", "language:ar", "language:as", "language:ast", "language:az", "language:bal", "language:be", "language:bem", "language:bg", "language:bn", "language:bo", "language:br", "language:brx", "language:bs", "language:ca", "language:crh", "language:cs", "language:csb", "language:cy", "language:da", "language:de", "language:dv", "language:dz", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:fi", "language:fo", "language:fr", "language:fur", "language:fy", "language:ga", "language:gd", "language:gl", "language:gn", "language:gu", "language:gv", "language:ha", "language:he", "language:hi", "language:hr", "language:hu", "language:hy", "language:ia", "language:id", "language:ig", "language:io", "language:is", "language:it", "language:ja", "language:jbo", "language:ka", "language:kg", "language:kk", "language:km", "language:kn", "language:ko", "language:kr", "language:ks", "language:ku", "language:ky", "language:la", "language:lg", "language:li", "language:lo", "language:lt", "language:lv", "language:mai", "language:mg", "language:mi", "language:mk", "language:ml", "language:mn", "language:mr", "language:ms", "language:mt", "language:mus", "language:my", "language:nb", "language:nds", "language:ne", "language:nhn", "language:nl", "language:nn", "language:no", "language:nqo", "language:nr", "language:nso", "language:oc", "language:or", "language:os", "language:pa", "language:pl", "language:ps", "language:pt", "language:quz", "language:ro", "language:ru", "language:rw", "language:si", "language:sk", "language:sl", "language:so", "language:sq", "language:sr", "language:st", "language:sv", "language:sw", "language:szl", "language:ta", "language:te", "language:tg", "language:th", "language:tk", "language:tl", "language:tr", "language:ts", "language:tt", "language:tyj", "language:ug", "language:uk", "language:ur", "language:uz", "language:vi", "language:wa", "language:xh", "language:yi", "language:yo", "language:zh", "language:zu"], "is_gated": false}, "opus_infopankki": {"dataset_name": "opus_infopankki", "description": "A parallel corpus of 12 languages, 66 bitexts.", "downloads": 9758, "configs": {"ar-en": {"config_name": "ar-en", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0645\\\\u0639\\\\u0644\\\\u0648\\\\u0645\\\\u0627\\\\u062a \\\\u0623...\", \"translation.en\": \"\\\"Basic information\\\"\"}", "columns": ["translation_ar", "translation_en"], "columns_mapping": {"translation.ar": "translation_ar", "translation.en": "translation_en"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-es": {"config_name": "ar-es", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\"}", "columns": ["translation_ar", "translation_es"], "columns_mapping": {"translation.ar": "translation_ar", "translation.es": "translation_es"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-et": {"config_name": "ar-et", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\"}", "columns": ["translation_ar", "translation_et"], "columns_mapping": {"translation.ar": "translation_ar", "translation.et": "translation_et"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-fa": {"config_name": "ar-fa", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\"}", "columns": ["translation_ar", "translation_fa"], "columns_mapping": {"translation.ar": "translation_ar", "translation.fa": "translation_fa"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-fi": {"config_name": "ar-fi", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0645\\\\u0639\\\\u0644\\\\u0648\\\\u0645\\\\u0627\\\\u062a \\\\u0623...\", \"translation.fi\": \"\\\"Perustietoa\\\"\"}", "columns": ["translation_ar", "translation_fi"], "columns_mapping": {"translation.ar": "translation_ar", "translation.fi": "translation_fi"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-fr": {"config_name": "ar-fr", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\"}", "columns": ["translation_ar", "translation_fr"], "columns_mapping": {"translation.ar": "translation_ar", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-ru": {"config_name": "ar-ru", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0645\\\\u0639\\\\u0644\\\\u0648\\\\u0645\\\\u0627\\\\u062a \\\\u0623...\", \"translation.ru\": \"\\\"\\\\u041e\\\\u0441\\\\u043d\\\\u043e\\\\u0432\\\\u043d\\\\u0430\\\\u044f ...\"}", "columns": ["translation_ar", "translation_ru"], "columns_mapping": {"translation.ar": "translation_ar", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-so": {"config_name": "ar-so", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_ar", "translation_so"], "columns_mapping": {"translation.ar": "translation_ar", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-sv": {"config_name": "ar-sv", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0645\\\\u0639\\\\u0644\\\\u0648\\\\u0645\\\\u0627\\\\u062a \\\\u0623...\", \"translation.sv\": \"\\\"Historia Trafik\\\"\"}", "columns": ["translation_ar", "translation_sv"], "columns_mapping": {"translation.ar": "translation_ar", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-tr": {"config_name": "ar-tr", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_ar", "translation_tr"], "columns_mapping": {"translation.ar": "translation_ar", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ar-zh": {"config_name": "ar-zh", "sample_row": "{\"translation.ar\": \"\\\"\\\\u0627\\\\u0644\\\\u0645\\\\u062d\\\\u062a\\\\u0648\\\\u0649 \\\\u063a...\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_ar", "translation_zh"], "columns_mapping": {"translation.ar": "translation_ar", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-es": {"config_name": "en-es", "sample_row": "{\"translation.en\": \"\\\"Content is not available in the selected language...\", \"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\"}", "columns": ["translation_en", "translation_es"], "columns_mapping": {"translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-et": {"config_name": "en-et", "sample_row": "{\"translation.en\": \"\\\"Content is not available in the selected language...\", \"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\"}", "columns": ["translation_en", "translation_et"], "columns_mapping": {"translation.en": "translation_en", "translation.et": "translation_et"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-fa": {"config_name": "en-fa", "sample_row": "{\"translation.en\": \"\\\"Content is not available in the selected language...\", \"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\"}", "columns": ["translation_en", "translation_fa"], "columns_mapping": {"translation.en": "translation_en", "translation.fa": "translation_fa"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-fi": {"config_name": "en-fi", "sample_row": "{\"translation.en\": \"\\\"Marriage\\\"\", \"translation.fi\": \"\\\"Avioliitto\\\"\"}", "columns": ["translation_en", "translation_fi"], "columns_mapping": {"translation.en": "translation_en", "translation.fi": "translation_fi"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"translation.en\": \"\\\"Marriage\\\"\", \"translation.fr\": \"\\\"Mariage\\\"\"}", "columns": ["translation_en", "translation_fr"], "columns_mapping": {"translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-ru": {"config_name": "en-ru", "sample_row": "{\"translation.en\": \"\\\"Marriage\\\"\", \"translation.ru\": \"\\\"\\\\u0411\\\\u0440\\\\u0430\\\\u043a\\\"\"}", "columns": ["translation_en", "translation_ru"], "columns_mapping": {"translation.en": "translation_en", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-so": {"config_name": "en-so", "sample_row": "{\"translation.en\": \"\\\"Content is not available in the selected language...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_en", "translation_so"], "columns_mapping": {"translation.en": "translation_en", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-sv": {"config_name": "en-sv", "sample_row": "{\"translation.en\": \"\\\"Basic information\\\"\", \"translation.sv\": \"\\\"Grundl\\\\u00e4ggande information\\\"\"}", "columns": ["translation_en", "translation_sv"], "columns_mapping": {"translation.en": "translation_en", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-tr": {"config_name": "en-tr", "sample_row": "{\"translation.en\": \"\\\"Content is not available in the selected language...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_en", "translation_tr"], "columns_mapping": {"translation.en": "translation_en", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "en-zh": {"config_name": "en-zh", "sample_row": "{\"translation.en\": \"\\\"Please select another language.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_en", "translation_zh"], "columns_mapping": {"translation.en": "translation_en", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-et": {"config_name": "es-et", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\"}", "columns": ["translation_es", "translation_et"], "columns_mapping": {"translation.es": "translation_es", "translation.et": "translation_et"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-fa": {"config_name": "es-fa", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\"}", "columns": ["translation_es", "translation_fa"], "columns_mapping": {"translation.es": "translation_es", "translation.fa": "translation_fa"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-fi": {"config_name": "es-fi", "sample_row": "{\"translation.es\": \"\\\"Todos los textos publicados en las p\\\\u00e1ginas w...\", \"translation.fi\": \"\\\"Kaikki InfoFinlandin verkkosivuilla julkaistut te...\"}", "columns": ["translation_es", "translation_fi"], "columns_mapping": {"translation.es": "translation_es", "translation.fi": "translation_fi"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-fr": {"config_name": "es-fr", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\"}", "columns": ["translation_es", "translation_fr"], "columns_mapping": {"translation.es": "translation_es", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-ru": {"config_name": "es-ru", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.ru\": \"\\\"\\\\u0418\\\\u043d\\\\u0444\\\\u043e\\\\u0440\\\\u043c\\\\u0430\\\\u0446\\\\...\"}", "columns": ["translation_es", "translation_ru"], "columns_mapping": {"translation.es": "translation_es", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-so": {"config_name": "es-so", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_es", "translation_so"], "columns_mapping": {"translation.es": "translation_es", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-sv": {"config_name": "es-sv", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\"}", "columns": ["translation_es", "translation_sv"], "columns_mapping": {"translation.es": "translation_es", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-tr": {"config_name": "es-tr", "sample_row": "{\"translation.es\": \"\\\"El contenido no est\\\\u00e1 disponible en el idioma...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_es", "translation_tr"], "columns_mapping": {"translation.es": "translation_es", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "es-zh": {"config_name": "es-zh", "sample_row": "{\"translation.es\": \"\\\"Por favor seleccione otro idioma.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_es", "translation_zh"], "columns_mapping": {"translation.es": "translation_es", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-fa": {"config_name": "et-fa", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav. Palu...\", \"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\"}", "columns": ["translation_et", "translation_fa"], "columns_mapping": {"translation.et": "translation_et", "translation.fa": "translation_fa"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-fi": {"config_name": "et-fi", "sample_row": "{\"translation.et\": \"\\\"P\\\\u00f5hiteave\\\"\", \"translation.fi\": \"\\\"Perustietoa\\\"\"}", "columns": ["translation_et", "translation_fi"], "columns_mapping": {"translation.et": "translation_et", "translation.fi": "translation_fi"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-fr": {"config_name": "et-fr", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\", \"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\"}", "columns": ["translation_et", "translation_fr"], "columns_mapping": {"translation.et": "translation_et", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-ru": {"config_name": "et-ru", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\", \"translation.ru\": \"\\\"\\\\u0418\\\\u043d\\\\u0444\\\\u043e\\\\u0440\\\\u043c\\\\u0430\\\\u0446\\\\...\"}", "columns": ["translation_et", "translation_ru"], "columns_mapping": {"translation.et": "translation_et", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-so": {"config_name": "et-so", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_et", "translation_so"], "columns_mapping": {"translation.et": "translation_et", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-sv": {"config_name": "et-sv", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\", \"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\"}", "columns": ["translation_et", "translation_sv"], "columns_mapping": {"translation.et": "translation_et", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-tr": {"config_name": "et-tr", "sample_row": "{\"translation.et\": \"\\\"Sisu ei ole valitud keeles k\\\\u00e4ttesaadav.\\\"\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_et", "translation_tr"], "columns_mapping": {"translation.et": "translation_et", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "et-zh": {"config_name": "et-zh", "sample_row": "{\"translation.et\": \"\\\"Palun vali m\\\\u00f5ni teine keel.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_et", "translation_zh"], "columns_mapping": {"translation.et": "translation_et", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-fi": {"config_name": "fa-fi", "sample_row": "{\"translation.fa\": \"\\\"\\\\u062a\\\\u0645\\\\u0627\\\\u0645 \\\\u0645\\\\u0637\\\\u0627\\\\u0644...\", \"translation.fi\": \"\\\"Kaikki InfoFinlandin verkkosivuilla julkaistut te...\"}", "columns": ["translation_fa", "translation_fi"], "columns_mapping": {"translation.fa": "translation_fa", "translation.fi": "translation_fi"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-fr": {"config_name": "fa-fr", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\"}", "columns": ["translation_fa", "translation_fr"], "columns_mapping": {"translation.fa": "translation_fa", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-ru": {"config_name": "fa-ru", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.ru\": \"\\\"\\\\u0418\\\\u043d\\\\u0444\\\\u043e\\\\u0440\\\\u043c\\\\u0430\\\\u0446\\\\...\"}", "columns": ["translation_fa", "translation_ru"], "columns_mapping": {"translation.fa": "translation_fa", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-so": {"config_name": "fa-so", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_fa", "translation_so"], "columns_mapping": {"translation.fa": "translation_fa", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-sv": {"config_name": "fa-sv", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\"}", "columns": ["translation_fa", "translation_sv"], "columns_mapping": {"translation.fa": "translation_fa", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-tr": {"config_name": "fa-tr", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_fa", "translation_tr"], "columns_mapping": {"translation.fa": "translation_fa", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fa-zh": {"config_name": "fa-zh", "sample_row": "{\"translation.fa\": \"\\\"\\\\u0627\\\\u06cc\\\\u0646 \\\\u0645\\\\u0637\\\\u0644\\\\u0628 \\\\u062...\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_fa", "translation_zh"], "columns_mapping": {"translation.fa": "translation_fa", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-fr": {"config_name": "fi-fr", "sample_row": "{\"translation.fi\": \"\\\"Avioliitto\\\"\", \"translation.fr\": \"\\\"Mariage\\\"\"}", "columns": ["translation_fi", "translation_fr"], "columns_mapping": {"translation.fi": "translation_fi", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-ru": {"config_name": "fi-ru", "sample_row": "{\"translation.fi\": \"\\\"Avioliitto\\\"\", \"translation.ru\": \"\\\"\\\\u0411\\\\u0440\\\\u0430\\\\u043a\\\"\"}", "columns": ["translation_fi", "translation_ru"], "columns_mapping": {"translation.fi": "translation_fi", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-so": {"config_name": "fi-so", "sample_row": "{\"translation.fi\": \"\\\"Kaikki InfoFinlandin verkkosivuilla julkaistut te...\", \"translation.so\": \"\\\"Dhammaan qoraallada ka kooban luqadaha kala duwan...\"}", "columns": ["translation_fi", "translation_so"], "columns_mapping": {"translation.fi": "translation_fi", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-sv": {"config_name": "fi-sv", "sample_row": "{\"translation.fi\": \"\\\"Perustietoa\\\"\", \"translation.sv\": \"\\\"Grundl\\\\u00e4ggande information\\\"\"}", "columns": ["translation_fi", "translation_sv"], "columns_mapping": {"translation.fi": "translation_fi", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-tr": {"config_name": "fi-tr", "sample_row": "{\"translation.fi\": \"\\\"Kaikki InfoFinlandin verkkosivuilla julkaistut te...\", \"translation.tr\": \"\\\"InfoFinland'\\\\u0131n internet sayfalar\\\\u0131nda hi...\"}", "columns": ["translation_fi", "translation_tr"], "columns_mapping": {"translation.fi": "translation_fi", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fi-zh": {"config_name": "fi-zh", "sample_row": "{\"translation.fi\": \"\\\"Kaikki InfoFinlandin verkkosivuilla julkaistut te...\", \"translation.zh\": \"\\\"InfoFinland\\\\u7f51\\\\u7ad9\\\\u4e0a\\\\u6240\\\\u6709\\\\u8bed\\\\u...\"}", "columns": ["translation_fi", "translation_zh"], "columns_mapping": {"translation.fi": "translation_fi", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fr-ru": {"config_name": "fr-ru", "sample_row": "{\"translation.fr\": \"\\\"Mariage\\\"\", \"translation.ru\": \"\\\"\\\\u0411\\\\u0440\\\\u0430\\\\u043a\\\"\"}", "columns": ["translation_fr", "translation_ru"], "columns_mapping": {"translation.fr": "translation_fr", "translation.ru": "translation_ru"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fr-so": {"config_name": "fr-so", "sample_row": "{\"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_fr", "translation_so"], "columns_mapping": {"translation.fr": "translation_fr", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fr-sv": {"config_name": "fr-sv", "sample_row": "{\"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\", \"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\"}", "columns": ["translation_fr", "translation_sv"], "columns_mapping": {"translation.fr": "translation_fr", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fr-tr": {"config_name": "fr-tr", "sample_row": "{\"translation.fr\": \"\\\"Le contenu n'est pas disponible dans la langue s\\\\...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_fr", "translation_tr"], "columns_mapping": {"translation.fr": "translation_fr", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "fr-zh": {"config_name": "fr-zh", "sample_row": "{\"translation.fr\": \"\\\"Veuillez s\\\\u00e9lectionner une autre langue.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_fr", "translation_zh"], "columns_mapping": {"translation.fr": "translation_fr", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ru-so": {"config_name": "ru-so", "sample_row": "{\"translation.ru\": \"\\\"\\\\u0418\\\\u043d\\\\u0444\\\\u043e\\\\u0440\\\\u043c\\\\u0430\\\\u0446\\\\...\", \"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\"}", "columns": ["translation_ru", "translation_so"], "columns_mapping": {"translation.ru": "translation_ru", "translation.so": "translation_so"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ru-sv": {"config_name": "ru-sv", "sample_row": "{\"translation.ru\": \"\\\"\\\\u041e\\\\u0441\\\\u043d\\\\u043e\\\\u0432\\\\u043d\\\\u0430\\\\u044f ...\", \"translation.sv\": \"\\\"Grundl\\\\u00e4ggande information\\\"\"}", "columns": ["translation_ru", "translation_sv"], "columns_mapping": {"translation.ru": "translation_ru", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ru-tr": {"config_name": "ru-tr", "sample_row": "{\"translation.ru\": \"\\\"\\\\u0418\\\\u043d\\\\u0444\\\\u043e\\\\u0440\\\\u043c\\\\u0430\\\\u0446\\\\...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_ru", "translation_tr"], "columns_mapping": {"translation.ru": "translation_ru", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "ru-zh": {"config_name": "ru-zh", "sample_row": "{\"translation.ru\": \"\\\"\\\\u041f\\\\u043e\\\\u043f\\\\u0440\\\\u043e\\\\u0431\\\\u0443\\\\u0439\\\\...\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_ru", "translation_zh"], "columns_mapping": {"translation.ru": "translation_ru", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "so-sv": {"config_name": "so-sv", "sample_row": "{\"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\", \"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\"}", "columns": ["translation_so", "translation_sv"], "columns_mapping": {"translation.so": "translation_so", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "so-tr": {"config_name": "so-tr", "sample_row": "{\"translation.so\": \"\\\"Waxyaabaha halkan ku jira laguma helayo luqada aa...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_so", "translation_tr"], "columns_mapping": {"translation.so": "translation_so", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "so-zh": {"config_name": "so-zh", "sample_row": "{\"translation.so\": \"\\\"Xulo luqad kale.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_so", "translation_zh"], "columns_mapping": {"translation.so": "translation_so", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "sv-tr": {"config_name": "sv-tr", "sample_row": "{\"translation.sv\": \"\\\"Detta inneh\\\\u00e5ll finns inte p\\\\u00e5 det spr\\\\u0...\", \"translation.tr\": \"\\\"Se\\\\u00e7mi\\\\u015f oldu\\\\u011funuz dilde bir i\\\\u00e7...\"}", "columns": ["translation_sv", "translation_tr"], "columns_mapping": {"translation.sv": "translation_sv", "translation.tr": "translation_tr"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "sv-zh": {"config_name": "sv-zh", "sample_row": "{\"translation.sv\": \"\\\"V\\\\u00e4lj n\\\\u00e5got annat spr\\\\u00e5k.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_sv", "translation_zh"], "columns_mapping": {"translation.sv": "translation_sv", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}, "tr-zh": {"config_name": "tr-zh", "sample_row": "{\"translation.tr\": \"\\\"L\\\\u00fctfen, ba\\\\u015fka bir dil se\\\\u00e7iniz.\\\"\", \"translation.zh\": \"\\\"\\\\u60a8\\\\u6240\\\\u9009\\\\u7684\\\\u8bed\\\\u8a00\\\\u6ca1\\\\u6709\\\\...\"}", "columns": ["translation_tr", "translation_zh"], "columns_mapping": {"translation.tr": "translation_tr", "translation.zh": "translation_zh"}, "dataset_description": "A parallel corpus of 12 languages, 66 bitexts.\n", "dataset_name": "opus_infopankki"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:ar", "language:en", "language:es", "language:et", "language:fa", "language:fi", "language:fr", "language:ru", "language:so", "language:sv", "language:tr", "language:zh"], "is_gated": false}, "opus_memat": {"dataset_name": "opus_memat", "description": "Xhosa-English parallel corpora, funded by EPSRC, the Medical Machine Translation project worked on machine translation between ixiXhosa and English, with a focus on the medical domain.", "downloads": 285, "configs": {"xh-en": {"config_name": "xh-en", "sample_row": "{\"translation.xh\": \"\\\"Kwathi emva kokufa kukaSawule, uDavide ebuyile ek...\", \"translation.en\": \"\\\"It happened after the death of Saul, when David w...\"}", "columns": ["translation_xh", "translation_en"], "columns_mapping": {"translation.xh": "translation_xh", "translation.en": "translation_en"}, "dataset_description": "Xhosa-English parallel corpora, funded by EPSRC, the Medical Machine Translation project worked on machine translation between ixiXhosa and English, with a focus on the medical domain.", "dataset_name": "opus_memat"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:en", "language:xh"], "is_gated": false}, "opus_montenegrinsubs": {"dataset_name": "opus_montenegrinsubs", "description": "Opus MontenegrinSubs dataset for machine translation task, for language pair en-me: english and montenegrin", "downloads": 285, "configs": {"en-me": {"config_name": "en-me", "sample_row": "{\"translation.en\": \"\\\"Season 1 Episode 1 Pilot (Dimension)\\\"\", \"translation.me\": \"\\\"OPASNE IGRE Pilot epizoda\\\"\"}", "columns": ["translation_en", "translation_me"], "columns_mapping": {"translation.en": "translation_en", "translation.me": "translation_me"}, "dataset_description": "Opus MontenegrinSubs dataset for machine translation task, for language pair en-me: english and montenegrin\n", "dataset_name": "opus_montenegrinsubs"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:cnr", "language:en"], "is_gated": false}, "opus_openoffice": {"dataset_name": "opus_openoffice", "description": "A collection of documents from http://www.openoffice.org/.", "downloads": 4103, "configs": {"de-en_GB": {"config_name": "de-en_GB", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\"}", "columns": ["translation_de", "translation_en_GB"], "columns_mapping": {"translation.de": "translation_de", "translation.en_GB": "translation_en_GB"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-es": {"config_name": "de-es", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\"}", "columns": ["translation_de", "translation_es"], "columns_mapping": {"translation.de": "translation_de", "translation.es": "translation_es"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\"}", "columns": ["translation_de", "translation_fr"], "columns_mapping": {"translation.de": "translation_de", "translation.fr": "translation_fr"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-ja": {"config_name": "de-ja", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\"}", "columns": ["translation_de", "translation_ja"], "columns_mapping": {"translation.de": "translation_de", "translation.ja": "translation_ja"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-ru": {"config_name": "de-ru", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\"}", "columns": ["translation_de", "translation_ru"], "columns_mapping": {"translation.de": "translation_de", "translation.ru": "translation_ru"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-sv": {"config_name": "de-sv", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_de", "translation_sv"], "columns_mapping": {"translation.de": "translation_de", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "de-zh_CN": {"config_name": "de-zh_CN", "sample_row": "{\"translation.de\": \"\\\"Diagramme in $[officename]\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_de", "translation_zh_CN"], "columns_mapping": {"translation.de": "translation_de", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-es": {"config_name": "en_GB-es", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\"}", "columns": ["translation_en_GB", "translation_es"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.es": "translation_es"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-fr": {"config_name": "en_GB-fr", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\"}", "columns": ["translation_en_GB", "translation_fr"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.fr": "translation_fr"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-ja": {"config_name": "en_GB-ja", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\"}", "columns": ["translation_en_GB", "translation_ja"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.ja": "translation_ja"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-ru": {"config_name": "en_GB-ru", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\"}", "columns": ["translation_en_GB", "translation_ru"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.ru": "translation_ru"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-sv": {"config_name": "en_GB-sv", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_en_GB", "translation_sv"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "en_GB-zh_CN": {"config_name": "en_GB-zh_CN", "sample_row": "{\"translation.en_GB\": \"\\\"Charts in $[officename]\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_en_GB", "translation_zh_CN"], "columns_mapping": {"translation.en_GB": "translation_en_GB", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "es-fr": {"config_name": "es-fr", "sample_row": "{\"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\", \"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\"}", "columns": ["translation_es", "translation_fr"], "columns_mapping": {"translation.es": "translation_es", "translation.fr": "translation_fr"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "es-ja": {"config_name": "es-ja", "sample_row": "{\"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\", \"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\"}", "columns": ["translation_es", "translation_ja"], "columns_mapping": {"translation.es": "translation_es", "translation.ja": "translation_ja"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "es-ru": {"config_name": "es-ru", "sample_row": "{\"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\", \"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\"}", "columns": ["translation_es", "translation_ru"], "columns_mapping": {"translation.es": "translation_es", "translation.ru": "translation_ru"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "es-sv": {"config_name": "es-sv", "sample_row": "{\"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_es", "translation_sv"], "columns_mapping": {"translation.es": "translation_es", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "es-zh_CN": {"config_name": "es-zh_CN", "sample_row": "{\"translation.es\": \"\\\"Gr\\\\u00e1ficas en $[officename]\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_es", "translation_zh_CN"], "columns_mapping": {"translation.es": "translation_es", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "fr-ja": {"config_name": "fr-ja", "sample_row": "{\"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\", \"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\"}", "columns": ["translation_fr", "translation_ja"], "columns_mapping": {"translation.fr": "translation_fr", "translation.ja": "translation_ja"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "fr-ru": {"config_name": "fr-ru", "sample_row": "{\"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\", \"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\"}", "columns": ["translation_fr", "translation_ru"], "columns_mapping": {"translation.fr": "translation_fr", "translation.ru": "translation_ru"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "fr-sv": {"config_name": "fr-sv", "sample_row": "{\"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_fr", "translation_sv"], "columns_mapping": {"translation.fr": "translation_fr", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "fr-zh_CN": {"config_name": "fr-zh_CN", "sample_row": "{\"translation.fr\": \"\\\"Diagrammes dans $[officename]\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_fr", "translation_zh_CN"], "columns_mapping": {"translation.fr": "translation_fr", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "ja-ru": {"config_name": "ja-ru", "sample_row": "{\"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\", \"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\"}", "columns": ["translation_ja", "translation_ru"], "columns_mapping": {"translation.ja": "translation_ja", "translation.ru": "translation_ru"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "ja-sv": {"config_name": "ja-sv", "sample_row": "{\"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_ja", "translation_sv"], "columns_mapping": {"translation.ja": "translation_ja", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "ja-zh_CN": {"config_name": "ja-zh_CN", "sample_row": "{\"translation.ja\": \"\\\"$[officename] \\\\u306e\\\\u30b0\\\\u30e9\\\\u30d5\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_ja", "translation_zh_CN"], "columns_mapping": {"translation.ja": "translation_ja", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "ru-sv": {"config_name": "ru-sv", "sample_row": "{\"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\", \"translation.sv\": \"\\\"Diagram i $[officename]\\\"\"}", "columns": ["translation_ru", "translation_sv"], "columns_mapping": {"translation.ru": "translation_ru", "translation.sv": "translation_sv"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "ru-zh_CN": {"config_name": "ru-zh_CN", "sample_row": "{\"translation.ru\": \"\\\"\\\\u0414\\\\u0438\\\\u0430\\\\u0433\\\\u0440\\\\u0430\\\\u043c\\\\u043c\\\\...\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_ru", "translation_zh_CN"], "columns_mapping": {"translation.ru": "translation_ru", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}, "sv-zh_CN": {"config_name": "sv-zh_CN", "sample_row": "{\"translation.sv\": \"\\\"Diagram i $[officename]\\\"\", \"translation.zh_CN\": \"\\\"$[officename] \\\\u4e2d\\\\u7684\\\\u56fe\\\\u8868\\\"\"}", "columns": ["translation_sv", "translation_zh_CN"], "columns_mapping": {"translation.sv": "translation_sv", "translation.zh_CN": "translation_zh_CN"}, "dataset_description": "A collection of documents from http://www.openoffice.org/.\n", "dataset_name": "opus_openoffice"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:de", "language:en", "language:es", "language:fr", "language:ja", "language:ru", "language:sv", "language:zh"], "is_gated": false}, "opus_paracrawl": {"dataset_name": "opus_paracrawl", "description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G", "downloads": 1592, "configs": {"el-en": {"config_name": "el-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"\\\\u03a3\\\\u03a5\\\\u039c\\\\u039c\\\\u0395\\\\u03a4\\\\u039f\\\\u03a7\\\\...\", \"translation.en\": \"\\\"PARTICIPATION IN THE NATIONAL CONFERENCE OF NORTH...\"}", "columns": ["id", "translation_el", "translation_en"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.en": "translation_en"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "en-km": {"config_name": "en-km", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"INCLUDED IN THIS BOOK ARE SECTIONS 103, 104, 105,...\", \"translation.km\": \"\\\"\\\\u179a\\\\u17bd\\\\u1798 \\\\u1794\\\\u1789\\\\u17d2\\\\u1785\\\\u17bc...\"}", "columns": ["id", "translation_en", "translation_km"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.km": "translation_km"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "en-so": {"config_name": "en-so", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"11 And the smoke of their torments shall ascend u...\", \"translation.so\": \"\\\"11 Oo qiiqa caddibaaddooda kor buu u baxayaa weli...\"}", "columns": ["id", "translation_en", "translation_so"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.so": "translation_so"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "de-pl": {"config_name": "de-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"126:2.5 (1388.5) Im Laufe der Jahre ma\\\\u00df dies...\", \"translation.pl\": \"\\\"(1388.5) 126:2.5 Z up\\\\u0142ywem lat m\\\\u0142ody ci...\"}", "columns": ["id", "translation_de", "translation_pl"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.pl": "translation_pl"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "fr-nl": {"config_name": "fr-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"3 Le sucre peut mettre fin \\\\u00e0 la grossesse?\\\"...\", \"translation.nl\": \"\\\"3 Kan suiker be\\\\u00ebindigen van de zwangerschap?...\"}", "columns": ["id", "translation_fr", "translation_nl"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.nl": "translation_nl"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "en-sw": {"config_name": "en-sw", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"1. We have all sinned and are therefore separated...\", \"translation.sw\": \"\\\"\\\\u2022 Wote tumetenda dhambi na kwa hivyo tumeten...\"}", "columns": ["id", "translation_en", "translation_sw"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.sw": "translation_sw"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "en-tl": {"config_name": "en-tl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Because police would not allow amplification syst...\", \"translation.tl\": \"\\\"Dahil ang pulis hindi papayagan ang paglaki mga s...\"}", "columns": ["id", "translation_en", "translation_tl"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.tl": "translation_tl"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}, "es-gl": {"config_name": "es-gl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"Esta vez el cuadro local con el segundo de la tar...\", \"translation.gl\": \"\\\"Ga\\\\u00f1ou confianza o cadro local co segundo da ...\"}", "columns": ["id", "translation_es", "translation_gl"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.gl": "translation_gl"}, "dataset_description": "Parallel corpora from Web Crawls collected in the ParaCrawl project.\n\n42 languages, 43 bitexts\ntotal number of files: 59,996\ntotal number of tokens: 56.11G\ntotal number of sentence fragments: 3.13G\n", "dataset_name": "opus_paracrawl"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:bg", "language:ca", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:eu", "language:fi", "language:fr", "language:ga", "language:gl", "language:hr", "language:hu", "language:is", "language:it", "language:km", "language:ko", "language:lt", "language:lv", "language:mt", "language:my", "language:nb", "language:ne", "language:nl", "language:nn", "language:pl", "language:pt", "language:ro", "language:ru", "language:si", "language:sk", "language:sl", "language:so", "language:sv", "language:sw", "language:tl", "language:uk", "language:zh"], "is_gated": false}, "opus_rf": {"dataset_name": "opus_rf", "description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.", "downloads": 1554, "configs": {"de-en": {"config_name": "de-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"REGIERUNGSERKL\\\\u00c4RUNG abgegeben vom Ministerpr...\", \"translation.en\": \"\\\"Statement of Government Policy by the Prime Minis...\"}", "columns": ["id", "translation_de", "translation_en"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.en": "translation_en"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "de-es": {"config_name": "de-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"MINISTERIUM DES AUSW\\\\u00c4RTIGEN Presseabteilung\\\"...\", \"translation.es\": \"\\\"MINISTERIO DE ASUNTOS EXTERIORES Servicio de Pren...\"}", "columns": ["id", "translation_de", "translation_es"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.es": "translation_es"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"REGIERUNGSERKL\\\\u00c4RUNG abgegeben vom Ministerpr...\", \"translation.fr\": \"\\\"Declaration de Politique G\\\\u00e9n\\\\u00e9rale du Go...\"}", "columns": ["id", "translation_de", "translation_fr"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.fr": "translation_fr"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "de-sv": {"config_name": "de-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"REGIERUNGSERKL\\\\u00c4RUNG abgegeben vom Ministerpr...\", \"translation.sv\": \"\\\"REGERINGSF\\\\u00d6RKLARING.\\\"\"}", "columns": ["id", "translation_de", "translation_sv"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.sv": "translation_sv"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "en-es": {"config_name": "en-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"MINISTRY FOR FOREIGN AFFAIRS Press Section Check ...\", \"translation.es\": \"\\\"MINISTERIO DE ASUNTOS EXTERIORES Servicio de Pren...\"}", "columns": ["id", "translation_en", "translation_es"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Statement of Government Policy by the Prime Minis...\", \"translation.fr\": \"\\\"Declaration de Politique G\\\\u00e9n\\\\u00e9rale du Go...\"}", "columns": ["id", "translation_en", "translation_fr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "en-sv": {"config_name": "en-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Statement of Government Policy by the Prime Minis...\", \"translation.sv\": \"\\\"REGERINGSF\\\\u00d6RKLARING.\\\"\"}", "columns": ["id", "translation_en", "translation_sv"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.sv": "translation_sv"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "es-fr": {"config_name": "es-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"MINISTERIO DE ASUNTOS EXTERIORES Servicio de Pren...\", \"translation.fr\": \"\\\"MINISTERE DES AFFAIRES \\\\u00c9TRANGERES Service de...\"}", "columns": ["id", "translation_es", "translation_fr"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.fr": "translation_fr"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "es-sv": {"config_name": "es-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"MINISTERIO DE ASUNTOS EXTERIORES Servicio de Pren...\", \"translation.sv\": \"\\\"Fru talman, \\\\u00e4rade ledam\\\\u00f6ter av Sveriges...\"}", "columns": ["id", "translation_es", "translation_sv"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.sv": "translation_sv"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}, "fr-sv": {"config_name": "fr-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"Declaration de Politique G\\\\u00e9n\\\\u00e9rale du Go...\", \"translation.sv\": \"\\\"REGERINGSF\\\\u00d6RKLARING.\\\"\"}", "columns": ["id", "translation_fr", "translation_sv"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.sv": "translation_sv"}, "dataset_description": "RF is a tiny parallel corpus of the Declarations of the Swedish Government and its translations.\n", "dataset_name": "opus_rf"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:de", "language:en", "language:es", "language:fr", "language:sv"], "is_gated": false}, "opus_tedtalks": {"dataset_name": "opus_tedtalks", "description": "This is a Croatian-English parallel corpus of transcribed and translated TED talks, originally extracted from https://wit3.fbk.eu. The corpus is compiled by \u017deljko Agi\u0107 and is taken from http://lt.ffzg.hr/zagic provided under the CC-BY-NC-SA license.\n2 languages, total number of files: 2\ntotal number of tokens: 2.81M\ntotal number of sentence fragments: 0.17M", "downloads": 303, "configs": {"en-hr": {"config_name": "en-hr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"I want you now to imagine a wearable robot that g...\", \"translation.hr\": \"\\\"\\\\u017delim da sada zamislite nosiv robot koji vam...\"}", "columns": ["id", "translation_en", "translation_hr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.hr": "translation_hr"}, "dataset_description": "This is a Croatian-English parallel corpus of transcribed and translated TED talks, originally extracted from https://wit3.fbk.eu. The corpus is compiled by \u017deljko Agi\u0107 and is taken from http://lt.ffzg.hr/zagic provided under the CC-BY-NC-SA license.\n2 languages, total number of files: 2\ntotal number of tokens: 2.81M\ntotal number of sentence fragments: 0.17M\n", "dataset_name": "opus_tedtalks"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:hr"], "is_gated": false}, "opus_ubuntu": {"dataset_name": "opus_ubuntu", "description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M", "downloads": 1555, "configs": {"as-bs": {"config_name": "as-bs", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.as\": \"\\\"Aisleriot \\\\u099a\\\\u09b2\\\\u09bf\\\\u099f\\\\u09c7\\\\u09df\\\\u0...\", \"translation.bs\": \"\\\"AisleRiot pasijans\\\"\"}", "columns": ["id", "translation_as", "translation_bs"], "columns_mapping": {"id": "id", "translation.as": "translation_as", "translation.bs": "translation_bs"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "az-cs": {"config_name": "az-cs", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.az\": \"\\\"Gmail, Google Docs, Google+ , YouTube v\\\\u0259 Pic...\", \"translation.cs\": \"\\\"Obashuje Gmail, Google Docs, Google+, YouTube a P...\"}", "columns": ["id", "translation_az", "translation_cs"], "columns_mapping": {"id": "id", "translation.az": "translation_az", "translation.cs": "translation_cs"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "bg-de": {"config_name": "bg-de", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"\\\\u0412\\\\u043a\\\\u043b\\\\u044e\\\\u0447\\\\u0432\\\\u0430 Gmail,...\", \"translation.de\": \"\\\"Umfasst Gmail, Google Docs, Google+, YouTube und ...\"}", "columns": ["id", "translation_bg", "translation_de"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.de": "translation_de"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "br-es_PR": {"config_name": "br-es_PR", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"N'eo c'hall ket beza\\\\u00f1 an arguzenn %s evit %s...\", \"translation.es_PR\": \"\\\"argumento %s inv\\\\u00e1lido para %s\\\"\"}", "columns": ["id", "translation_br", "translation_es_PR"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.es_PR": "translation_es_PR"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "bn-ga": {"config_name": "bn-ga", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bn\": \"\\\"AisleRiot \\\\u09b8\\\\u09b2\\\\u09bf\\\\u099f\\\\u09c7\\\\u09df\\\\u0...\", \"translation.ga\": \"\\\"Cluiche Aonair AisleRiot\\\"\"}", "columns": ["id", "translation_bn", "translation_ga"], "columns_mapping": {"id": "id", "translation.bn": "translation_bn", "translation.ga": "translation_ga"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "br-hi": {"config_name": "br-hi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"Enderc'hel a ra Gmail, Google Docs, Google+, YouT...\", \"translation.hi\": \"\\\"\\\\u0936\\\\u093e\\\\u092e\\\\u093f\\\\u0932 \\\\u0915\\\\u0930\\\\u0924...\"}", "columns": ["id", "translation_br", "translation_hi"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.hi": "translation_hi"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "br-la": {"config_name": "br-la", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"Mat eo\\\"\", \"translation.la\": \"\\\"Bene\\\"\"}", "columns": ["id", "translation_br", "translation_la"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.la": "translation_la"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "bs-szl": {"config_name": "bs-szl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bs\": \"\\\"Uredu\\\"\", \"translation.szl\": \"\\\"OK\\\"\"}", "columns": ["id", "translation_bs", "translation_szl"], "columns_mapping": {"id": "id", "translation.bs": "translation_bs", "translation.szl": "translation_szl"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "br-uz": {"config_name": "br-uz", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"Kas an danevell a-fet kudenno\\\\u00f9 da baotred an...\", \"translation.uz\": \"\\\"\\\\u0422\\\\u0443\\\\u0437\\\\u0443\\\\u0432\\\\u0447\\\\u0438\\\\u043b\\\\...\"}", "columns": ["id", "translation_br", "translation_uz"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.uz": "translation_uz"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}, "br-yi": {"config_name": "br-yi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.br\": \"\\\"Mat eo\\\"\", \"translation.yi\": \"\\\"\\\\u05d2\\\\u05d5\\\\u05d8\\\"\"}", "columns": ["id", "translation_br", "translation_yi"], "columns_mapping": {"id": "id", "translation.br": "translation_br", "translation.yi": "translation_yi"}, "dataset_description": "A parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.net\n244 languages, 23,988 bitexts\ntotal number of files: 30,959\ntotal number of tokens: 29.84M\ntotal number of sentence fragments: 7.73M\n", "dataset_name": "opus_ubuntu"}}, "tags": ["task_categories:translation", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:ace", "language:af", "language:ak", "language:am", "language:an", "language:ang", "language:ar", "language:ary", "language:as", "language:ast", "language:az", "language:ba", "language:bal", "language:be", "language:bem", "language:ber", "language:bg", "language:bho", "language:bn", "language:bo", "language:br", "language:brx", "language:bs", "language:bua", "language:byn", "language:ca", "language:ce", "language:ceb", "language:chr", "language:ckb", "language:co", "language:crh", "language:cs", "language:csb", "language:cv", "language:cy", "language:da", "language:de", "language:dsb", "language:dv", "language:dz", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:ff", "language:fi", "language:fil", "language:fo", "language:fr", "language:frm", "language:frp", "language:fur", "language:fy", "language:ga", "language:gd", "language:gl", "language:gn", "language:grc", "language:gu", "language:guc", "language:gv", "language:ha", "language:haw", "language:he", "language:hi", "language:hil", "language:hne", "language:hr", "language:hsb", "language:ht", "language:hu", "language:hy", "language:ia", "language:id", "language:ig", "language:io", "language:is", "language:it", "language:iu", "language:ja", "language:jbo", "language:jv", "language:ka", "language:kab", "language:kg", "language:kk", "language:kl", "language:km", "language:kn", "language:ko", "language:kok", "language:ks", "language:ksh", "language:ku", "language:kw", "language:ky", "language:la", "language:lb", "language:lg", "language:li", "language:lij", "language:lld", "language:ln", "language:lo", "language:lt", "language:ltg", "language:lv", "language:mai", "language:mg", "language:mh", "language:mhr", "language:mi", "language:miq", "language:mk", "language:ml", "language:mn", "language:mr", "language:ms", "language:mt", "language:mus", "language:my", "language:nan", "language:nap", "language:nb", "language:nds", "language:ne", "language:nhn", "language:nl", "language:nn", "language:no", "language:nso", "language:ny", "language:oc", "language:om", "language:or", "language:os", "language:pa", "language:pam", "language:pap", "language:pl", "language:pms", "language:pmy", "language:ps", "language:pt", "language:qu", "language:rm", "language:ro", "language:rom", "language:ru", "language:rw", "language:sa", "language:sc", "language:sco", "language:sd", "language:se", "language:shn", "language:shs", "language:si", "language:sk", "language:sl", "language:sm", "language:sml", "language:sn", "language:so", "language:son", "language:sq", "language:sr", "language:st", "language:sv", "language:sw", "language:syr", "language:szl", "language:ta", "language:te", "language:tet", "language:tg", "language:th", "language:ti", "language:tk", "language:tl", "language:tlh", "language:tr", "language:trv", "language:ts", "language:tt", "language:ug", "language:uk", "language:ur", "language:uz", "language:ve", "language:vec", "language:vi", "language:wa", "language:wae", "language:wo", "language:xal", "language:xh", "language:yi", "language:yo", "language:zh", "language:zu", "language:zza"], "is_gated": false}, "opus_wikipedia": {"dataset_name": "opus_wikipedia", "description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M", "downloads": 1114, "configs": {"ar-en": {"config_name": "ar-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ar\": \"\\\"* Encyclopaedia of Mathematics online encyclopaed...\", \"translation.en\": \"\\\"*Encyclopaedia of Mathematics online encyclopaedi...\"}", "columns": ["id", "translation_ar", "translation_en"], "columns_mapping": {"id": "id", "translation.ar": "translation_ar", "translation.en": "translation_en"}, "dataset_description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M\n", "dataset_name": "opus_wikipedia"}, "ar-pl": {"config_name": "ar-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ar\": \"\\\"\\\\u0646\\\\u0638\\\\u0627\\\\u0645 \\\\u062a\\\\u0631\\\\u0645\\\\u064a...\", \"translation.pl\": \"\\\"ASCII (czyt.\\\"\"}", "columns": ["id", "translation_ar", "translation_pl"], "columns_mapping": {"id": "id", "translation.ar": "translation_ar", "translation.pl": "translation_pl"}, "dataset_description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M\n", "dataset_name": "opus_wikipedia"}, "en-sl": {"config_name": "en-sl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"__NOTOC__Year 886 (DCCCLXXXVI) was a common year ...\", \"translation.sl\": \"\\\"\\\\u017divel je predvsem v Bagdadu.\\\"\"}", "columns": ["id", "translation_en", "translation_sl"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.sl": "translation_sl"}, "dataset_description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M\n", "dataset_name": "opus_wikipedia"}, "en-ru": {"config_name": "en-ru", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Average temperatures on the coast are in January ...\", \"translation.ru\": \"\\\"\\\\u0427\\\\u0435\\\\u0440\\\\u0435\\\\u0437 \\\\u043d\\\\u0435\\\\u0434...\"}", "columns": ["id", "translation_en", "translation_ru"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.ru": "translation_ru"}, "dataset_description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M\n", "dataset_name": "opus_wikipedia"}, "en-vi": {"config_name": "en-vi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"A. Blackburn (2009).\\\"\", \"translation.vi\": \"\\\"A. Blackburn (2009).\\\"\"}", "columns": ["id", "translation_en", "translation_vi"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.vi": "translation_vi"}, "dataset_description": "This is a corpus of parallel sentences extracted from Wikipedia by Krzysztof Wo\u0142k and Krzysztof Marasek. Please cite the following publication if you use the data: Krzysztof Wo\u0142k and Krzysztof Marasek: Building Subject-aligned Comparable Corpora and Mining it for Truly Parallel Sentence Pairs., Procedia Technology, 18, Elsevier, p.126-132, 2014\n20 languages, 36 bitexts\ntotal number of files: 114\ntotal number of tokens: 610.13M\ntotal number of sentence fragments: 25.90M\n", "dataset_name": "opus_wikipedia"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:ar", "language:bg", "language:cs", "language:de", "language:el", "language:en", "language:es", "language:fa", "language:fr", "language:he", "language:hu", "language:it", "language:nl", "language:pl", "language:pt", "language:ro", "language:ru", "language:sl", "language:tr", "language:vi"], "is_gated": false}, "opus_xhosanavy": {"dataset_name": "opus_xhosanavy", "description": "This dataset is designed for machine translation from English to Xhosa.", "downloads": 349, "configs": {"en-xh": {"config_name": "en-xh", "sample_row": "{\"translation.en\": \"\\\"Rope and its Usage\\\"\", \"translation.xh\": \"\\\"Intambo nomsebenzi ewenzayo.\\\"\"}", "columns": ["translation_en", "translation_xh"], "columns_mapping": {"translation.en": "translation_en", "translation.xh": "translation_xh"}, "dataset_description": "This dataset is designed for machine translation from English to Xhosa.", "dataset_name": "opus_xhosanavy"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:en", "language:xh"], "is_gated": false}, "oscar": {"dataset_name": "oscar", "description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.\\", "downloads": 57575, "configs": {"unshuffled_deduplicated_af": {"config_name": "unshuffled_deduplicated_af", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"aanlyn markte as gevolg van ons voortgesette 'n b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_als": {"config_name": "unshuffled_deduplicated_als", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dr 6. Augschte isch dr 218. Dag vum Gregorianisch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_am": {"config_name": "unshuffled_deduplicated_am", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u12a0\\\\u12e8\\\\u122d \\\\u1218\\\\u1295\\\\u1308\\\\u12f1 \\\\u12a...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_an": {"config_name": "unshuffled_deduplicated_an", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0648\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ar": {"config_name": "unshuffled_deduplicated_ar", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0645\\\\u0631\\\\u062d\\\\u0628\\\\u0627 \\\\u0628\\\\u0643 \\\\u063...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_arz": {"config_name": "unshuffled_deduplicated_arz", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"..\\\\u064c::\\\\u064c:: \\\\u0627\\\\u0644\\\\u0646\\\\u0633\\\\u0627...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_as": {"config_name": "unshuffled_deduplicated_as", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0986\\\\u09ae\\\\u09bf, \\\\u098f\\\\u0987 \\\\u09b8\\\\u0982\\\\u09...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ast": {"config_name": "unshuffled_deduplicated_ast", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"The Killers llanzaron el so \\\\u00e1lbum deb\\\\u00fa,...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_av": {"config_name": "unshuffled_deduplicated_av", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0416\\\\u0438\\\\u043d\\\\u0434\\\\u0430 \\\\u043c\\\\u0430\\\\u043b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_az": {"config_name": "unshuffled_deduplicated_az", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"AZTV-Art\\\\u0131q 7 ildir ki, Ab\\\\u015feron rayonu d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_azb": {"config_name": "unshuffled_deduplicated_azb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0644\\\\u0639\\\\u0644\\\\u06cc \\\\u0661\\\\u0663-\\\\u062c\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ba": {"config_name": "unshuffled_deduplicated_ba", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041a\\\\u04d9\\\\u0441\\\\u0435\\\\u0440 \\\\u043c\\\\u0430\\\\u0442...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bar": {"config_name": "unshuffled_deduplicated_bar", "sample_row": "{\"id\": \"0\", \"text\": \"\\\" ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bcl": {"config_name": "unshuffled_deduplicated_bcl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"& \\\\u00ff \\\\u00f3 / \\\\u00ed 0 - \\\\u00f8 \\\\u00fb \\\\u00f9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_be": {"config_name": "unshuffled_deduplicated_be", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0411\\\\u0440\\\\u044d\\\\u0441\\\\u0446\\\\u043a\\\\u0456\\\\u044f ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bg": {"config_name": "unshuffled_deduplicated_bg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0416\\\\u0410\\\\u041b\\\\u0411\\\\u041e\\\\u041f\\\\u041e\\\\u0414\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bh": {"config_name": "unshuffled_deduplicated_bh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0908 \\\\u0938\\\\u0947\\\\u0939\\\\u0924 \\\\u0906 \\\\u0938\\\\u09...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bn": {"config_name": "unshuffled_deduplicated_bn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u09ad\\\\u09dc\\\\u0982 \\\\u09b8\\\\u09b0\\\\u09cd\\\\u09ac\\\\u09b8...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bo": {"config_name": "unshuffled_deduplicated_bo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0f56\\\\u0f7c\\\\u0f51\\\\u0f0b\\\\u0f58\\\\u0f72\\\\u0f0b\\\\u0f60\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bpy": {"config_name": "unshuffled_deduplicated_bpy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u09aa\\\\u09cc\\\\u09b0\\\\u09b8\\\\u09ad\\\\u09be \\\\u098f\\\\u09b9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_br": {"config_name": "unshuffled_deduplicated_br", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Ar mank Magalh\\\\u00e3es(Daveo\\\\u00f9 a vank) a zo u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bs": {"config_name": "unshuffled_deduplicated_bs", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u017e \\\\u0161\\\\u0159 \\\\u00e9 \\\\u00fa \\\\u0161\\\\u0159 \\\\u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_bxr": {"config_name": "unshuffled_deduplicated_bxr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0421\\\\u0430\\\\u0433\\\\u0430\\\\u0430\\\\u043d h\\\\u0430\\\\u044...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ca": {"config_name": "unshuffled_deduplicated_ca", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Daniel Vendrell, conegut com Vandrell, ha sigut u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_cbk": {"config_name": "unshuffled_deduplicated_cbk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"yo gano yo gano yo gano yo gano yo gano yo gano y...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ce": {"config_name": "unshuffled_deduplicated_ce", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0413\\\\u0440\\\\u0435\\\\u043d\\\\u043b\\\\u0430\\\\u043d\\\\u0434\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ceb": {"config_name": "unshuffled_deduplicated_ceb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Si Isko walay pupamilok nga nagtan-aw sa unahan, ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ckb": {"config_name": "unshuffled_deduplicated_ckb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0631\\\\u0633\\\\u06cc \\\\u0631\\\\u06c6\\\\u0698 - \\\\u0633\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_cs": {"config_name": "unshuffled_deduplicated_cs", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Oran\\\\u017eovou stuhu 2018 z\\\\u00edskala od Ministe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_cv": {"config_name": "unshuffled_deduplicated_cv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0428\\\\u044b\\\\u0440\\\\u0430\\\\u043d\\\\u04d1 \\\\u0447\\\\u0443...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_cy": {"config_name": "unshuffled_deduplicated_cy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mae capeli Cymreig yr Andes ym Mhatagonia wedi cy...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_da": {"config_name": "unshuffled_deduplicated_da", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Den 2.-5. februar 2016 l\\\\u00f8b det tredje kursus...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_de": {"config_name": "unshuffled_deduplicated_de", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dosierf\\\\u00f6rderb\\\\u00e4nder Getriebe Entw\\\\u00e4s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_diq": {"config_name": "unshuffled_deduplicated_diq", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Z\\\\u0131wan\\\\u00ea Slawki, z\\\\u0131wano merduman\\\\u00...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_dsb": {"config_name": "unshuffled_deduplicated_dsb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zg\\\\u00f3\\\\u0144\\\\u015bo w\\\\u011bcej w\\\\u00f3 l\\\\u011bp...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_dv": {"config_name": "unshuffled_deduplicated_dv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0784. \\\\u0787\\\\u07a6\\\\u078c\\\\u07ae\\\\u0785\\\\u07aa\\\\u078...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_el": {"config_name": "unshuffled_deduplicated_el", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u039d\\\\u03b5\\\\u03ba\\\\u03c1\\\\u03cc\\\\u03c2 \\\\u03b5\\\\u03bd...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_eml": {"config_name": "unshuffled_deduplicated_eml", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"A s\\\\u00e9guit dal pruc\\\\u00e8s ad rubuti\\\\u015basi\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_en": {"config_name": "unshuffled_deduplicated_en", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mtendere Village was inspired by the vision of Ch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_eo": {"config_name": "unshuffled_deduplicated_eo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0108u ... pre\\\\u011di | mediti | ricevi instigoj...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_es": {"config_name": "unshuffled_deduplicated_es", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Como se librar\\\\u00e1 de la celulitis en el gimnas...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_et": {"config_name": "unshuffled_deduplicated_et", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"MT\\\\u00dc AB Video j\\\\u00e4rgib oma tegevuses kodan...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_eu": {"config_name": "unshuffled_deduplicated_eu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Gure jarduerek eraikuntzarekin, elkarbizitzarekin...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_fa": {"config_name": "unshuffled_deduplicated_fa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0627\\\\u0645\\\\u0634\\\\u0628 \\\\u0628\\\\u0627\\\\u0631\\\\u0648...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_fi": {"config_name": "unshuffled_deduplicated_fi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Luokka Kauniita tytt\\\\u00f6j\\\\u00e4, Teini, Porno p...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_fr": {"config_name": "unshuffled_deduplicated_fr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"M\\\\u00e9dia de d\\\\u00e9bat d'id\\\\u00e9es, de culture...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_frr": {"config_name": "unshuffled_deduplicated_frr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Hiragana\\\\u2019 Practice\\\\u2019Sheet\\\\u20191\\\\u2019(A...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_fy": {"config_name": "unshuffled_deduplicated_fy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Veen, Klaas F. van der et al1984-2011Wurdboek fan...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ga": {"config_name": "unshuffled_deduplicated_ga", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Is f\\\\u00f3ram \\\\u00e9 seo chun pl\\\\u00e9 a dh\\\\u00e9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_gd": {"config_name": "unshuffled_deduplicated_gd", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zhou Yujun, a 'ph\\\\u00e0rtaidh R\\\\u00f9naire Comata...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_gl": {"config_name": "unshuffled_deduplicated_gl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"O persoal de Inditex da provincia de Pontevedra s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_gn": {"config_name": "unshuffled_deduplicated_gn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Oiko pete\\\\u0129 kirir\\\\u0129 \\\\u00f1emond\\\\u00fdi pe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_gom": {"config_name": "unshuffled_deduplicated_gom", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0905\\\\u091c\\\\u093e\\\\u0915\\\\u0902\\\\u0920\\\\u0940\\\\u0902\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_gu": {"config_name": "unshuffled_deduplicated_gu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0a85\\\\u0aa7\\\\u0abf\\\\u0a95 \\\\u0aae\\\\u0abe\\\\u0ab8 \\\\u0a9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_he": {"config_name": "unshuffled_deduplicated_he", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u05d6\\\\u05e7\\\\u05d5\\\\u05e7\\\\u05d9\\\\u05dd \\\\u05dc\\\\u05e8...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_hi": {"config_name": "unshuffled_deduplicated_hi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"'\\\\u0906\\\\u0907\\\\u091f\\\\u092e \\\\u0917\\\\u0930\\\\u094d\\\\u093...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_hr": {"config_name": "unshuffled_deduplicated_hr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"U raspravi je sudjelovao i HSS-ov saborski zastup...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_hsb": {"config_name": "unshuffled_deduplicated_hsb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Budy\\\\u0161in (SN/B\\\\u0160e). Elektronikarjo m\\\\u011...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ht": {"config_name": "unshuffled_deduplicated_ht", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u043d\\\\u0430\\\\u0447\\\\u0430\\\\u0442\\\\u044c us $ nan us ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_hu": {"config_name": "unshuffled_deduplicated_hu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"monster - Amat\\\\u0151r, h\\\\u00e1zi szex vide\\\\u00f3k...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_hy": {"config_name": "unshuffled_deduplicated_hy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0531\\\\u0580\\\\u0581\\\\u0561\\\\u056d\\\\u056b \\\\u0540\\\\u0561...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ia": {"config_name": "unshuffled_deduplicated_ia", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha h...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_id": {"config_name": "unshuffled_deduplicated_id", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"~pic by cetusanminda. Marhalah yang sering disebu...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ie": {"config_name": "unshuffled_deduplicated_ie", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Plastic Yo Yo Metal Yo Yos Wooden Yo Yo Keychain ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ilo": {"config_name": "unshuffled_deduplicated_ilo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Segun ken ni Ping-ay, ti yellow corn ti maysa kad...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_io": {"config_name": "unshuffled_deduplicated_io", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Chekia esas parlamentala republiko. La chefo di s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_is": {"config_name": "unshuffled_deduplicated_is", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Eyjar.net - uppl\\\\u00fdsinga- og fr\\\\u00e9ttami\\\\u00...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_it": {"config_name": "unshuffled_deduplicated_it", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"La estrazione numero 48 del 10 e LOTTO ogni 5 min...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ja": {"config_name": "unshuffled_deduplicated_ja", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u795e\\\\u793e\\\\u306a\\\\u3069\\\\u3078\\\\u4e00\\\\u7dd2\\\\u306b\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_jbo": {"config_name": "unshuffled_deduplicated_jbo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"lo ni lo vacri cu glare cu banzuni lo nu ro da po...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_jv": {"config_name": "unshuffled_deduplicated_jv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Jos\\\\u00e9 Mourinho (diwaca: [\\\\u0292u\\\\u02c8z\\\\u025b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ka": {"config_name": "unshuffled_deduplicated_ka", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u10ec\\\\u10d0\\\\u10db\\\\u10d8\\\\u10e7\\\\u10d5\\\\u10d0\\\\u10dc\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_kk": {"config_name": "unshuffled_deduplicated_kk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0422\\\\u04af\\\\u043b\\\\u043a\\\\u0456\\\\u0431\\\\u0430\\\\u0441 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_km": {"config_name": "unshuffled_deduplicated_km", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u1781\\\\u17d2\\\\u179f\\\\u17b9\\\\u1794\\\\u178a\\\\u17b6\\\\u1780\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_kn": {"config_name": "unshuffled_deduplicated_kn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0cb0\\\\u0cbe\\\\u0cb7\\\\u0ccd\\\\u0c9f\\\\u0ccd\\\\u0cb0\\\\u0caa\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ko": {"config_name": "unshuffled_deduplicated_ko", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"CIA \\\\ud504\\\\ub85c\\\\uc81d\\\\ud2b8\\\\uc5d0\\\\uc11c\\\\ub294 \\\\u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_krc": {"config_name": "unshuffled_deduplicated_krc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0428\\\\u0430\\\\u043c\\\\u0445\\\\u0430\\\\u043d\\\\u043b\\\\u0430\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ku": {"config_name": "unshuffled_deduplicated_ku", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"R\\\\u00eaxistina maf\\\\u00ean mirovan Freedom House r...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_kv": {"config_name": "unshuffled_deduplicated_kv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041a\\\\u043e\\\\u043c\\\\u0438 \\\\u043a\\\\u044b\\\\u0442\\\\u0448...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_kw": {"config_name": "unshuffled_deduplicated_kw", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\ud83d\\\\ude4f\\\\ud83c\\\\udffc\\\\ud83d\\\\ude4f\\\\ud83c\\\\udffc\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ky": {"config_name": "unshuffled_deduplicated_ky", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Turmush: \\\\u0411\\\\u0438\\\\u0448\\\\u043a\\\\u0435\\\\u043a \\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_la": {"config_name": "unshuffled_deduplicated_la", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"H\\\\u00e6 sunt generationes No\\\\u00eb: No\\\\u00eb vir ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lb": {"config_name": "unshuffled_deduplicated_lb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"W\\\\u00e9i all Joers ass d'Fuesend nees eng m\\\\u00e9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lez": {"config_name": "unshuffled_deduplicated_lez", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0411\\\\u0435\\\\u0441, \\\\u043b\\\\u0435\\\\u0437\\\\u0433\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_li": {"config_name": "unshuffled_deduplicated_li", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"'t Good Goedenraad aan de Ezerbaek besjteit oet '...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lmo": {"config_name": "unshuffled_deduplicated_lmo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Calvens\\\\u00e0 l'\\\\u00e8 a 24 km de la sit\\\\u00e0 e ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lo": {"config_name": "unshuffled_deduplicated_lo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0ea7\\\\u0eb5\\\\u200b\\\\u0ec2\\\\u0ead\\\\u200b\\\\u0ec0\\\\u0ead\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lrc": {"config_name": "unshuffled_deduplicated_lrc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0622\\\\u0631\\\\u0644\\\\u06cc\\\\u0646\\\\u06af\\\\u062a\\\\u0648\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lt": {"config_name": "unshuffled_deduplicated_lt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0160i programa pad\\\\u0117s geriau i\\\\u0161mokti i...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_lv": {"config_name": "unshuffled_deduplicated_lv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Divdesmit pirmaj\\\\u0101 apr\\\\u012bl\\\\u012b m\\\\u016bsu...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mai": {"config_name": "unshuffled_deduplicated_mai", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u096e \\\\u0938\\\\u093f\\\\u0924\\\\u092e\\\\u094d\\\\u092c\\\\u0930...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mg": {"config_name": "unshuffled_deduplicated_mg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Hijery ny Tenim-Pirenena rehetra? Mandika ny tant...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mhr": {"config_name": "unshuffled_deduplicated_mhr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0410\\\\u043a\\\\u0440\\\\u0435\\\\u0442 \\\\u0436\\\\u0430\\\\u043f...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_min": {"config_name": "unshuffled_deduplicated_min", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"holaholaholaholaholaholaholaholaholaholaholaholah...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mk": {"config_name": "unshuffled_deduplicated_mk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u201e\\\\u0424\\\\u0438\\\\u043b\\\\u043c \\\\u043f\\\\u043b\\\\u0443...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ml": {"config_name": "unshuffled_deduplicated_ml", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0d05\\\\u0d38\\\\u0d2d\\\\u0d4d\\\\u0d2f\\\\u0d35\\\\u0d41\\\\u0d02 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mn": {"config_name": "unshuffled_deduplicated_mn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041c\\\\u0423\\\\u0411\\\\u0418\\\\u0421-\\\\u044b\\\\u043d \\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mr": {"config_name": "unshuffled_deduplicated_mr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Home / motivational marathi story / \\\\u0909\\\\u0926\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mrj": {"config_name": "unshuffled_deduplicated_mrj", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041b\\\\u04f9\\\\u043f\\\\u04f9\\\\u0432\\\\u043b\\\\u04d3 (\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ms": {"config_name": "unshuffled_deduplicated_ms", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Suhaib memang antara orang yang aktif berprogram....\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mt": {"config_name": "unshuffled_deduplicated_mt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"tibg\\\\u0127at il-kaw\\\\u017ca lura lill-Qorti \\\\u0120...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mwl": {"config_name": "unshuffled_deduplicated_mwl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Deciplina social i out\\\\u00f3noma que angloba ateb...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_my": {"config_name": "unshuffled_deduplicated_my", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u103b\\\\u1019\\\\u1040\\\\u1010\\\\u102e - \\\\u101b\\\\u1014\\\\u10...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_myv": {"config_name": "unshuffled_deduplicated_myv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0427\\\\u0430\\\\u0447\\\\u0441\\\\u044c 1914 \\\\u0443\\\\u043c\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_mzn": {"config_name": "unshuffled_deduplicated_mzn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0642\\\\u0631\\\\u0622\\\\u0646 \\\\u06cc\\\\u0627 \\\\u0642\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_nah": {"config_name": "unshuffled_deduplicated_nah", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"In m\\\\u0101cu\\\\u012blp\\\\u014dhualxihuitl VI (inic ch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_nap": {"config_name": "unshuffled_deduplicated_nap", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00f2 AUDIT \\\\u00ed \\\\u00c7 \\\\u00e8 \\\\u00ee \\\\u00ff \\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_nds": {"config_name": "unshuffled_deduplicated_nds", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dor kann sik vun nu af an de hele plattd\\\\u00fc\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ne": {"config_name": "unshuffled_deduplicated_ne", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u092c\\\\u0930\\\\u094d\\\\u0926\\\\u093f\\\\u092c\\\\u093e\\\\u0938 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_new": {"config_name": "unshuffled_deduplicated_new", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0930\\\\u093e\\\\u0917 \\\\u0938\\\\u0941\\\\u0939\\\\u093e \\\\u091...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_nl": {"config_name": "unshuffled_deduplicated_nl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Op vrijdag 31 augustus wordt het nieuwe studiejaa...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_nn": {"config_name": "unshuffled_deduplicated_nn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Planomtale krav til innhald Bakgrunn: Sp\\\\u00f8rsm...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_no": {"config_name": "unshuffled_deduplicated_no", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Slett midlertidige internett filer og informasjon...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_oc": {"config_name": "unshuffled_deduplicated_oc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"jizzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_or": {"config_name": "unshuffled_deduplicated_or", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0b2d\\\\u0b41\\\\u0b2c\\\\u0b28\\\\u0b47\\\\u0b36\\\\u0b4d\\\\u0b71\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_os": {"config_name": "unshuffled_deduplicated_os", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"1. \\\\u041b\\\\u00e6\\\\u043f\\\\u043f\\\\u0443 \\\\u00e6\\\\u043c\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pa": {"config_name": "unshuffled_deduplicated_pa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0a30\\\\u0a1c\\\\u0a3f: \\\\u0a28\\\\u0a70: PB/JL-138/2018-...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pam": {"config_name": "unshuffled_deduplicated_pam", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00c1ku pu i Anak ning Al\\\\u00e1ya at ngeni ip\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pl": {"config_name": "unshuffled_deduplicated_pl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Wszyscy producenci Alkazar Opole Biuro Wydawnicze...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pms": {"config_name": "unshuffled_deduplicated_pms", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dolina (an sloven; San Dorligo della Valle an ita...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pnb": {"config_name": "unshuffled_deduplicated_pnb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0641\\\\u0631\\\\u06cc\\\\u0646\\\\u06a9 \\\\u0628\\\\u0644\\\\u0646...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ps": {"config_name": "unshuffled_deduplicated_ps", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Many people usually use the time period \\\\u2018bus...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_pt": {"config_name": "unshuffled_deduplicated_pt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Voc\\\\u00ea pode estar lendo este texto no sof\\\\u00e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_qu": {"config_name": "unshuffled_deduplicated_qu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Topeka nisqa llaqtaqa, Kansas suyup, Hukllachasqa...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_rm": {"config_name": "unshuffled_deduplicated_rm", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"2. secziun Elavuraziun da datas e protecziun da d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ro": {"config_name": "unshuffled_deduplicated_ro", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u201c\\\\u00cen via\\\\u021b\\\\u0103, oportunitatea nu e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ru": {"config_name": "unshuffled_deduplicated_ru", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0415\\\\u0441\\\\u043b\\\\u0438 \\\\u0432\\\\u0430\\\\u0448\\\\u0438...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sa": {"config_name": "unshuffled_deduplicated_sa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0905\\\\u0928\\\\u093f\\\\u0930\\\\u0941\\\\u0926\\\\u094d\\\\u0927\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sah": {"config_name": "unshuffled_deduplicated_sah", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_scn": {"config_name": "unshuffled_deduplicated_scn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"La gilus\\\\u00eca \\\\u00e8 nu sintimentu dulurusu ca ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sd": {"config_name": "unshuffled_deduplicated_sd", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0647\\\\u0631 \\\\u06aa\\\\u0648 \\\\u0684\\\\u0627\\\\u06bb\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sh": {"config_name": "unshuffled_deduplicated_sh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Op\\\\u0161tina Gornja Radgona se nalazi u sjeverois...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_si": {"config_name": "unshuffled_deduplicated_si", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0da2\\\\u0db1\\\\u0dcf\\\\u0db0\\\\u0dd2\\\\u0db4\\\\u0dad\\\\u0dd2\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sk": {"config_name": "unshuffled_deduplicated_sk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Aktivity | Agent\\\\u00fara podporovan\\\\u00e9ho zames...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sl": {"config_name": "unshuffled_deduplicated_sl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u010ce Creatures, ki je \\\\u017eelel, da pridejo n...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_so": {"config_name": "unshuffled_deduplicated_so", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sq": {"config_name": "unshuffled_deduplicated_sq", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00c7far\\\\u00eb do t\\\\u00eb m\\\\u00eb p\\\\u00eblqente ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sr": {"config_name": "unshuffled_deduplicated_sr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"*\\\\u041e\\\\u0432\\\\u0430 \\\\u043f\\\\u043e\\\\u0440\\\\u0443\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_su": {"config_name": "unshuffled_deduplicated_su", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Lalaki n\\\\u00e9mbongkeun kakuatan jeung vigor jeun...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sv": {"config_name": "unshuffled_deduplicated_sv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"1783 \\\\u00e4r ett viktigt \\\\u00e5rtal i den nya tid...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_sw": {"config_name": "unshuffled_deduplicated_sw", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zarif: Iran inajua mpango wa Saudia wa kufanya ma...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ta": {"config_name": "unshuffled_deduplicated_ta", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0baa\\\\u0bca\\\\u0bb4\\\\u0bc1\\\\u0ba4\\\\u0bc1 \\\\u0b9a\\\\u0bbe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_te": {"config_name": "unshuffled_deduplicated_te", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0c39\\\\u0c30\\\\u0c4d\\\\u0c2f\\\\u0c3e\\\\u0c28\\\\u0c3e\\\\u0c32\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tg": {"config_name": "unshuffled_deduplicated_tg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u04b2\\\\u0443\\\\u043c\\\\u0430\\\\u0439\\\\u0440\\\\u043e \\\\u0433...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_th": {"config_name": "unshuffled_deduplicated_th", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0e1f\\\\u0e31\\\\u0e19\\\\u0e17\\\\u0e35\\\\u0e48\\\\u0e41\\\\u0e25\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tk": {"config_name": "unshuffled_deduplicated_tk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Amerikany\\\\u0148 Kaliforni\\\\u00fda \\\\u015ftatyndaky ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tl": {"config_name": "unshuffled_deduplicated_tl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Warning Signs na Sira ang Kidneys o Bato - ni Doc...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tr": {"config_name": "unshuffled_deduplicated_tr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Son y\\\\u0131llarda g\\\\u00f6r\\\\u00fclen ay tutulmalar...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tt": {"config_name": "unshuffled_deduplicated_tt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\\\\"\\\\u0418\\\\u0440\\\\u0435\\\\u043c\\\\u043d\\\\u0435\\\\u04a3 \\\\u04...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_tyv": {"config_name": "unshuffled_deduplicated_tyv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u042d\\\\u043a\\\\u0438\\\\u0438, \\\\u0445\\\\u04af\\\\u043d\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ug": {"config_name": "unshuffled_deduplicated_ug", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0632\\\\u0627\\\\u06ad-\\\\u0621\\\\u062a\\\\u06c7\\\\u0632\\\\u0649...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_uk": {"config_name": "unshuffled_deduplicated_uk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041f\\\\u0440\\\\u043e \\\\u043d\\\\u0430\\\\u0434\\\\u0430\\\\u043d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_ur": {"config_name": "unshuffled_deduplicated_ur", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0622\\\\u0626\\\\u06cc\\\\u06d2 \\\\u0627\\\\u06c1\\\\u0645 \\\\u062...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_uz": {"config_name": "unshuffled_deduplicated_uz", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Markazi Sariosiyo shaharchasi. 1926-yil 29-sentab...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_vec": {"config_name": "unshuffled_deduplicated_vec", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Par ogni p\\\\u00f3nto, \\\\u0142a derivada \\\\u0142a xe ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_vi": {"config_name": "unshuffled_deduplicated_vi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Canh chua c\\\\u00e1 b\\\\u00f4ng lau kh\\\\u00f4ng ch\\\\u1e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_vo": {"config_name": "unshuffled_deduplicated_vo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mettweiler binon zif in fedal\\\\u00e4n: Rheinland-P...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_wa": {"config_name": "unshuffled_deduplicated_wa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"ci n' est n\\\\u00e9n l' viyaedje lu-minme ki sait e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_war": {"config_name": "unshuffled_deduplicated_war", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"An Tajan amo in usa ka komyun ha departamento han...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_wuu": {"config_name": "unshuffled_deduplicated_wuu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u4f0a15 [I] | \\\\u4f0a17 | \\\\u4f0a19 | \\\\u4f0a21 | \\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_xal": {"config_name": "unshuffled_deduplicated_xal", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0410\\\\u0440\\\\u043d\\\\u0433\\\\u0443\\\\u0434\\\\u0438\\\\u043d ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_xmf": {"config_name": "unshuffled_deduplicated_xmf", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u10db\\\\u10dd\\\\u10e9\\\\u10d0\\\\u10db\\\\u10d8\\\\u10da\\\\u10d8 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_yi": {"config_name": "unshuffled_deduplicated_yi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u05de\\\\u05de\\\\u05e9\\\\u05d5\\\\u05ea\\\\u05d3\\\\u05d9\\\\u05e7 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_yo": {"config_name": "unshuffled_deduplicated_yo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Copyright \\\\u00a9 2018 BBC. BBC k\\\\u00f2 m\\\\u1ecd\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_yue": {"config_name": "unshuffled_deduplicated_yue", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"*hughughughughughughughughughughughughughughughug...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_deduplicated_zh": {"config_name": "unshuffled_deduplicated_zh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u65f6\\\\u95f4\\\\u53ef\\\\u4ee5\\\\u88ab\\\\u7f29\\\\u77ed\\\\uff0c\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_af": {"config_name": "unshuffled_original_af", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"aanlyn markte as gevolg van ons voortgesette 'n b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_als": {"config_name": "unshuffled_original_als", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dr 6. Augschte isch dr 218. Dag vum Gregorianisch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_am": {"config_name": "unshuffled_original_am", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u12a0\\\\u12e8\\\\u122d \\\\u1218\\\\u1295\\\\u1308\\\\u12f1 \\\\u12a...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_an": {"config_name": "unshuffled_original_an", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0648\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\u0627\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ar": {"config_name": "unshuffled_original_ar", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0645\\\\u0631\\\\u062d\\\\u0628\\\\u0627 \\\\u0628\\\\u0643 \\\\u063...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_arz": {"config_name": "unshuffled_original_arz", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"..\\\\u064c::\\\\u064c:: \\\\u0627\\\\u0644\\\\u0646\\\\u0633\\\\u0627...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_as": {"config_name": "unshuffled_original_as", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0986\\\\u09ae\\\\u09bf, \\\\u098f\\\\u0987 \\\\u09b8\\\\u0982\\\\u09...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ast": {"config_name": "unshuffled_original_ast", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"The Killers llanzaron el so \\\\u00e1lbum deb\\\\u00fa,...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_av": {"config_name": "unshuffled_original_av", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0416\\\\u0438\\\\u043d\\\\u0434\\\\u0430 \\\\u043c\\\\u0430\\\\u043b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_az": {"config_name": "unshuffled_original_az", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"AZTV-Art\\\\u0131q 7 ildir ki, Ab\\\\u015feron rayonu d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_azb": {"config_name": "unshuffled_original_azb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0644\\\\u0639\\\\u0644\\\\u06cc \\\\u0661\\\\u0663-\\\\u062c\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ba": {"config_name": "unshuffled_original_ba", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041a\\\\u04d9\\\\u0441\\\\u0435\\\\u0440 \\\\u043c\\\\u0430\\\\u0442...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bar": {"config_name": "unshuffled_original_bar", "sample_row": "{\"id\": \"0\", \"text\": \"\\\" ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bcl": {"config_name": "unshuffled_original_bcl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"& \\\\u00ff \\\\u00f3 / \\\\u00ed 0 - \\\\u00f8 \\\\u00fb \\\\u00f9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_be": {"config_name": "unshuffled_original_be", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0411\\\\u0440\\\\u044d\\\\u0441\\\\u0446\\\\u043a\\\\u0456\\\\u044f ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bg": {"config_name": "unshuffled_original_bg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0416\\\\u0410\\\\u041b\\\\u0411\\\\u041e\\\\u041f\\\\u041e\\\\u0414\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bh": {"config_name": "unshuffled_original_bh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0908 \\\\u0938\\\\u0947\\\\u0939\\\\u0924 \\\\u0906 \\\\u0938\\\\u09...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bn": {"config_name": "unshuffled_original_bn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u09ad\\\\u09dc\\\\u0982 \\\\u09b8\\\\u09b0\\\\u09cd\\\\u09ac\\\\u09b8...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bo": {"config_name": "unshuffled_original_bo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0f56\\\\u0f7c\\\\u0f51\\\\u0f0b\\\\u0f58\\\\u0f72\\\\u0f0b\\\\u0f60\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bpy": {"config_name": "unshuffled_original_bpy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u09aa\\\\u09cc\\\\u09b0\\\\u09b8\\\\u09ad\\\\u09be \\\\u098f\\\\u09b9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_br": {"config_name": "unshuffled_original_br", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Ar mank Magalh\\\\u00e3es(Daveo\\\\u00f9 a vank) a zo u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bs": {"config_name": "unshuffled_original_bs", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u017e \\\\u0161\\\\u0159 \\\\u00e9 \\\\u00fa \\\\u0161\\\\u0159 \\\\u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_bxr": {"config_name": "unshuffled_original_bxr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0421\\\\u0430\\\\u0433\\\\u0430\\\\u0430\\\\u043d h\\\\u0430\\\\u044...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ca": {"config_name": "unshuffled_original_ca", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Daniel Vendrell, conegut com Vandrell, ha sigut u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_cbk": {"config_name": "unshuffled_original_cbk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"yo gano yo gano yo gano yo gano yo gano yo gano y...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ce": {"config_name": "unshuffled_original_ce", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0413\\\\u0440\\\\u0435\\\\u043d\\\\u043b\\\\u0430\\\\u043d\\\\u0434\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ceb": {"config_name": "unshuffled_original_ceb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Si Isko walay pupamilok nga nagtan-aw sa unahan, ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ckb": {"config_name": "unshuffled_original_ckb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0631\\\\u0633\\\\u06cc \\\\u0631\\\\u06c6\\\\u0698 - \\\\u0633\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_cs": {"config_name": "unshuffled_original_cs", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Oran\\\\u017eovou stuhu 2018 z\\\\u00edskala od Ministe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_cv": {"config_name": "unshuffled_original_cv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0428\\\\u044b\\\\u0440\\\\u0430\\\\u043d\\\\u04d1 \\\\u0447\\\\u0443...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_cy": {"config_name": "unshuffled_original_cy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mae capeli Cymreig yr Andes ym Mhatagonia wedi cy...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_da": {"config_name": "unshuffled_original_da", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Den 2.-5. februar 2016 l\\\\u00f8b det tredje kursus...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_de": {"config_name": "unshuffled_original_de", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dosierf\\\\u00f6rderb\\\\u00e4nder Getriebe Entw\\\\u00e4s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_diq": {"config_name": "unshuffled_original_diq", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Z\\\\u0131wan\\\\u00ea Slawki, z\\\\u0131wano merduman\\\\u00...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_dsb": {"config_name": "unshuffled_original_dsb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zg\\\\u00f3\\\\u0144\\\\u015bo w\\\\u011bcej w\\\\u00f3 l\\\\u011bp...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_dv": {"config_name": "unshuffled_original_dv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0784. \\\\u0787\\\\u07a6\\\\u078c\\\\u07ae\\\\u0785\\\\u07aa\\\\u078...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_el": {"config_name": "unshuffled_original_el", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u039d\\\\u03b5\\\\u03ba\\\\u03c1\\\\u03cc\\\\u03c2 \\\\u03b5\\\\u03bd...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_eml": {"config_name": "unshuffled_original_eml", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"A s\\\\u00e9guit dal pruc\\\\u00e8s ad rubuti\\\\u015basi\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_en": {"config_name": "unshuffled_original_en", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mtendere Village was inspired by the vision of Ch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_eo": {"config_name": "unshuffled_original_eo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0108u ... pre\\\\u011di | mediti | ricevi instigoj...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_es": {"config_name": "unshuffled_original_es", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Como se librar\\\\u00e1 de la celulitis en el gimnas...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_et": {"config_name": "unshuffled_original_et", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"MT\\\\u00dc AB Video j\\\\u00e4rgib oma tegevuses kodan...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_eu": {"config_name": "unshuffled_original_eu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Gure jarduerek eraikuntzarekin, elkarbizitzarekin...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_fa": {"config_name": "unshuffled_original_fa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0627\\\\u0645\\\\u0634\\\\u0628 \\\\u0628\\\\u0627\\\\u0631\\\\u0648...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_fi": {"config_name": "unshuffled_original_fi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Luokka Kauniita tytt\\\\u00f6j\\\\u00e4, Teini, Porno p...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_fr": {"config_name": "unshuffled_original_fr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"M\\\\u00e9dia de d\\\\u00e9bat d'id\\\\u00e9es, de culture...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_frr": {"config_name": "unshuffled_original_frr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Hiragana\\\\u2019 Practice\\\\u2019Sheet\\\\u20191\\\\u2019(A...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_fy": {"config_name": "unshuffled_original_fy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Veen, Klaas F. van der et al1984-2011Wurdboek fan...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ga": {"config_name": "unshuffled_original_ga", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Is f\\\\u00f3ram \\\\u00e9 seo chun pl\\\\u00e9 a dh\\\\u00e9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_gd": {"config_name": "unshuffled_original_gd", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zhou Yujun, a 'ph\\\\u00e0rtaidh R\\\\u00f9naire Comata...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_gl": {"config_name": "unshuffled_original_gl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"O persoal de Inditex da provincia de Pontevedra s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_gn": {"config_name": "unshuffled_original_gn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Oiko pete\\\\u0129 kirir\\\\u0129 \\\\u00f1emond\\\\u00fdi pe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_gom": {"config_name": "unshuffled_original_gom", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0905\\\\u091c\\\\u093e\\\\u0915\\\\u0902\\\\u0920\\\\u0940\\\\u0902\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_gu": {"config_name": "unshuffled_original_gu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0a85\\\\u0aa7\\\\u0abf\\\\u0a95 \\\\u0aae\\\\u0abe\\\\u0ab8 \\\\u0a9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_he": {"config_name": "unshuffled_original_he", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u05d6\\\\u05e7\\\\u05d5\\\\u05e7\\\\u05d9\\\\u05dd \\\\u05dc\\\\u05e8...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_hi": {"config_name": "unshuffled_original_hi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"'\\\\u0906\\\\u0907\\\\u091f\\\\u092e \\\\u0917\\\\u0930\\\\u094d\\\\u093...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_hr": {"config_name": "unshuffled_original_hr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"U raspravi je sudjelovao i HSS-ov saborski zastup...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_hsb": {"config_name": "unshuffled_original_hsb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Budy\\\\u0161in (SN/B\\\\u0160e). Elektronikarjo m\\\\u011...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ht": {"config_name": "unshuffled_original_ht", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u043d\\\\u0430\\\\u0447\\\\u0430\\\\u0442\\\\u044c us $ nan us ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_hu": {"config_name": "unshuffled_original_hu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"monster - Amat\\\\u0151r, h\\\\u00e1zi szex vide\\\\u00f3k...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_hy": {"config_name": "unshuffled_original_hy", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0531\\\\u0580\\\\u0581\\\\u0561\\\\u056d\\\\u056b \\\\u0540\\\\u0561...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ia": {"config_name": "unshuffled_original_ia", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha ha h...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_id": {"config_name": "unshuffled_original_id", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"~pic by cetusanminda. Marhalah yang sering disebu...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ie": {"config_name": "unshuffled_original_ie", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Plastic Yo Yo Metal Yo Yos Wooden Yo Yo Keychain ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ilo": {"config_name": "unshuffled_original_ilo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Segun ken ni Ping-ay, ti yellow corn ti maysa kad...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_io": {"config_name": "unshuffled_original_io", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Chekia esas parlamentala republiko. La chefo di s...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_is": {"config_name": "unshuffled_original_is", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Eyjar.net - uppl\\\\u00fdsinga- og fr\\\\u00e9ttami\\\\u00...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_it": {"config_name": "unshuffled_original_it", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"La estrazione numero 48 del 10 e LOTTO ogni 5 min...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ja": {"config_name": "unshuffled_original_ja", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u795e\\\\u793e\\\\u306a\\\\u3069\\\\u3078\\\\u4e00\\\\u7dd2\\\\u306b\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_jbo": {"config_name": "unshuffled_original_jbo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"lo ni lo vacri cu glare cu banzuni lo nu ro da po...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_jv": {"config_name": "unshuffled_original_jv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Jos\\\\u00e9 Mourinho (diwaca: [\\\\u0292u\\\\u02c8z\\\\u025b...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ka": {"config_name": "unshuffled_original_ka", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u10ec\\\\u10d0\\\\u10db\\\\u10d8\\\\u10e7\\\\u10d5\\\\u10d0\\\\u10dc\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_kk": {"config_name": "unshuffled_original_kk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0422\\\\u04af\\\\u043b\\\\u043a\\\\u0456\\\\u0431\\\\u0430\\\\u0441 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_km": {"config_name": "unshuffled_original_km", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u1781\\\\u17d2\\\\u179f\\\\u17b9\\\\u1794\\\\u178a\\\\u17b6\\\\u1780\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_kn": {"config_name": "unshuffled_original_kn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0cb0\\\\u0cbe\\\\u0cb7\\\\u0ccd\\\\u0c9f\\\\u0ccd\\\\u0cb0\\\\u0caa\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ko": {"config_name": "unshuffled_original_ko", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"CIA \\\\ud504\\\\ub85c\\\\uc81d\\\\ud2b8\\\\uc5d0\\\\uc11c\\\\ub294 \\\\u...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_krc": {"config_name": "unshuffled_original_krc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0428\\\\u0430\\\\u043c\\\\u0445\\\\u0430\\\\u043d\\\\u043b\\\\u0430\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ku": {"config_name": "unshuffled_original_ku", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"R\\\\u00eaxistina maf\\\\u00ean mirovan Freedom House r...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_kv": {"config_name": "unshuffled_original_kv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041a\\\\u043e\\\\u043c\\\\u0438 \\\\u043a\\\\u044b\\\\u0442\\\\u0448...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_kw": {"config_name": "unshuffled_original_kw", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\ud83d\\\\ude4f\\\\ud83c\\\\udffc\\\\ud83d\\\\ude4f\\\\ud83c\\\\udffc\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ky": {"config_name": "unshuffled_original_ky", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Turmush: \\\\u0411\\\\u0438\\\\u0448\\\\u043a\\\\u0435\\\\u043a \\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_la": {"config_name": "unshuffled_original_la", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"H\\\\u00e6 sunt generationes No\\\\u00eb: No\\\\u00eb vir ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lb": {"config_name": "unshuffled_original_lb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"W\\\\u00e9i all Joers ass d'Fuesend nees eng m\\\\u00e9...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lez": {"config_name": "unshuffled_original_lez", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0411\\\\u0435\\\\u0441, \\\\u043b\\\\u0435\\\\u0437\\\\u0433\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_li": {"config_name": "unshuffled_original_li", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"'t Good Goedenraad aan de Ezerbaek besjteit oet '...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lmo": {"config_name": "unshuffled_original_lmo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Calvens\\\\u00e0 l'\\\\u00e8 a 24 km de la sit\\\\u00e0 e ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lo": {"config_name": "unshuffled_original_lo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0ea7\\\\u0eb5\\\\u200b\\\\u0ec2\\\\u0ead\\\\u200b\\\\u0ec0\\\\u0ead\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lrc": {"config_name": "unshuffled_original_lrc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0622\\\\u0631\\\\u0644\\\\u06cc\\\\u0646\\\\u06af\\\\u062a\\\\u0648\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lt": {"config_name": "unshuffled_original_lt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0160i programa pad\\\\u0117s geriau i\\\\u0161mokti i...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_lv": {"config_name": "unshuffled_original_lv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Divdesmit pirmaj\\\\u0101 apr\\\\u012bl\\\\u012b m\\\\u016bsu...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mai": {"config_name": "unshuffled_original_mai", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u096e \\\\u0938\\\\u093f\\\\u0924\\\\u092e\\\\u094d\\\\u092c\\\\u0930...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mg": {"config_name": "unshuffled_original_mg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Hijery ny Tenim-Pirenena rehetra? Mandika ny tant...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mhr": {"config_name": "unshuffled_original_mhr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0410\\\\u043a\\\\u0440\\\\u0435\\\\u0442 \\\\u0436\\\\u0430\\\\u043f...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_min": {"config_name": "unshuffled_original_min", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"holaholaholaholaholaholaholaholaholaholaholaholah...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mk": {"config_name": "unshuffled_original_mk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u201e\\\\u0424\\\\u0438\\\\u043b\\\\u043c \\\\u043f\\\\u043b\\\\u0443...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ml": {"config_name": "unshuffled_original_ml", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0d05\\\\u0d38\\\\u0d2d\\\\u0d4d\\\\u0d2f\\\\u0d35\\\\u0d41\\\\u0d02 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mn": {"config_name": "unshuffled_original_mn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041c\\\\u0423\\\\u0411\\\\u0418\\\\u0421-\\\\u044b\\\\u043d \\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mr": {"config_name": "unshuffled_original_mr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Home / motivational marathi story / \\\\u0909\\\\u0926\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mrj": {"config_name": "unshuffled_original_mrj", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041b\\\\u04f9\\\\u043f\\\\u04f9\\\\u0432\\\\u043b\\\\u04d3 (\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ms": {"config_name": "unshuffled_original_ms", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Suhaib memang antara orang yang aktif berprogram....\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mt": {"config_name": "unshuffled_original_mt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"tibg\\\\u0127at il-kaw\\\\u017ca lura lill-Qorti \\\\u0120...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mwl": {"config_name": "unshuffled_original_mwl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Deciplina social i out\\\\u00f3noma que angloba ateb...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_my": {"config_name": "unshuffled_original_my", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u103b\\\\u1019\\\\u1040\\\\u1010\\\\u102e - \\\\u101b\\\\u1014\\\\u10...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_myv": {"config_name": "unshuffled_original_myv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0427\\\\u0430\\\\u0447\\\\u0441\\\\u044c 1914 \\\\u0443\\\\u043c\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_mzn": {"config_name": "unshuffled_original_mzn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0642\\\\u0631\\\\u0622\\\\u0646 \\\\u06cc\\\\u0627 \\\\u0642\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_nah": {"config_name": "unshuffled_original_nah", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"In m\\\\u0101cu\\\\u012blp\\\\u014dhualxihuitl VI (inic ch...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_nap": {"config_name": "unshuffled_original_nap", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00f2 AUDIT \\\\u00ed \\\\u00c7 \\\\u00e8 \\\\u00ee \\\\u00ff \\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_nds": {"config_name": "unshuffled_original_nds", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dor kann sik vun nu af an de hele plattd\\\\u00fc\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ne": {"config_name": "unshuffled_original_ne", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u092c\\\\u0930\\\\u094d\\\\u0926\\\\u093f\\\\u092c\\\\u093e\\\\u0938 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_new": {"config_name": "unshuffled_original_new", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0930\\\\u093e\\\\u0917 \\\\u0938\\\\u0941\\\\u0939\\\\u093e \\\\u091...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_nl": {"config_name": "unshuffled_original_nl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Op vrijdag 31 augustus wordt het nieuwe studiejaa...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_nn": {"config_name": "unshuffled_original_nn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Planomtale krav til innhald Bakgrunn: Sp\\\\u00f8rsm...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_no": {"config_name": "unshuffled_original_no", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Slett midlertidige internett filer og informasjon...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_oc": {"config_name": "unshuffled_original_oc", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"jizzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_or": {"config_name": "unshuffled_original_or", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0b2d\\\\u0b41\\\\u0b2c\\\\u0b28\\\\u0b47\\\\u0b36\\\\u0b4d\\\\u0b71\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_os": {"config_name": "unshuffled_original_os", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"1. \\\\u041b\\\\u00e6\\\\u043f\\\\u043f\\\\u0443 \\\\u00e6\\\\u043c\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pa": {"config_name": "unshuffled_original_pa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0a30\\\\u0a1c\\\\u0a3f: \\\\u0a28\\\\u0a70: PB/JL-138/2018-...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pam": {"config_name": "unshuffled_original_pam", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00c1ku pu i Anak ning Al\\\\u00e1ya at ngeni ip\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pl": {"config_name": "unshuffled_original_pl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Wszyscy producenci Alkazar Opole Biuro Wydawnicze...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pms": {"config_name": "unshuffled_original_pms", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Dolina (an sloven; San Dorligo della Valle an ita...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pnb": {"config_name": "unshuffled_original_pnb", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0641\\\\u0631\\\\u06cc\\\\u0646\\\\u06a9 \\\\u0628\\\\u0644\\\\u0646...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ps": {"config_name": "unshuffled_original_ps", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Many people usually use the time period \\\\u2018bus...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_pt": {"config_name": "unshuffled_original_pt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Voc\\\\u00ea pode estar lendo este texto no sof\\\\u00e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_qu": {"config_name": "unshuffled_original_qu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Topeka nisqa llaqtaqa, Kansas suyup, Hukllachasqa...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_rm": {"config_name": "unshuffled_original_rm", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"2. secziun Elavuraziun da datas e protecziun da d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ro": {"config_name": "unshuffled_original_ro", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u201c\\\\u00cen via\\\\u021b\\\\u0103, oportunitatea nu e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ru": {"config_name": "unshuffled_original_ru", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0415\\\\u0441\\\\u043b\\\\u0438 \\\\u0432\\\\u0430\\\\u0448\\\\u0438...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sa": {"config_name": "unshuffled_original_sa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0905\\\\u0928\\\\u093f\\\\u0930\\\\u0941\\\\u0926\\\\u094d\\\\u0927\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sah": {"config_name": "unshuffled_original_sah", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\u2588\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_scn": {"config_name": "unshuffled_original_scn", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"La gilus\\\\u00eca \\\\u00e8 nu sintimentu dulurusu ca ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sd": {"config_name": "unshuffled_original_sd", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0647\\\\u0631 \\\\u06aa\\\\u0648 \\\\u0684\\\\u0627\\\\u06bb\\\\u064...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sh": {"config_name": "unshuffled_original_sh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Op\\\\u0161tina Gornja Radgona se nalazi u sjeverois...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_si": {"config_name": "unshuffled_original_si", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0da2\\\\u0db1\\\\u0dcf\\\\u0db0\\\\u0dd2\\\\u0db4\\\\u0dad\\\\u0dd2\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sk": {"config_name": "unshuffled_original_sk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Aktivity | Agent\\\\u00fara podporovan\\\\u00e9ho zames...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sl": {"config_name": "unshuffled_original_sl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u010ce Creatures, ki je \\\\u017eelel, da pridejo n...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_so": {"config_name": "unshuffled_original_so", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\u0442\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sq": {"config_name": "unshuffled_original_sq", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u00c7far\\\\u00eb do t\\\\u00eb m\\\\u00eb p\\\\u00eblqente ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sr": {"config_name": "unshuffled_original_sr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"*\\\\u041e\\\\u0432\\\\u0430 \\\\u043f\\\\u043e\\\\u0440\\\\u0443\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_su": {"config_name": "unshuffled_original_su", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Lalaki n\\\\u00e9mbongkeun kakuatan jeung vigor jeun...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sv": {"config_name": "unshuffled_original_sv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"1783 \\\\u00e4r ett viktigt \\\\u00e5rtal i den nya tid...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_sw": {"config_name": "unshuffled_original_sw", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Zarif: Iran inajua mpango wa Saudia wa kufanya ma...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ta": {"config_name": "unshuffled_original_ta", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0baa\\\\u0bca\\\\u0bb4\\\\u0bc1\\\\u0ba4\\\\u0bc1 \\\\u0b9a\\\\u0bbe...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_te": {"config_name": "unshuffled_original_te", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0c39\\\\u0c30\\\\u0c4d\\\\u0c2f\\\\u0c3e\\\\u0c28\\\\u0c3e\\\\u0c32\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tg": {"config_name": "unshuffled_original_tg", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u04b2\\\\u0443\\\\u043c\\\\u0430\\\\u0439\\\\u0440\\\\u043e \\\\u0433...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_th": {"config_name": "unshuffled_original_th", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0e1f\\\\u0e31\\\\u0e19\\\\u0e17\\\\u0e35\\\\u0e48\\\\u0e41\\\\u0e25\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tk": {"config_name": "unshuffled_original_tk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Amerikany\\\\u0148 Kaliforni\\\\u00fda \\\\u015ftatyndaky ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tl": {"config_name": "unshuffled_original_tl", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Warning Signs na Sira ang Kidneys o Bato - ni Doc...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tr": {"config_name": "unshuffled_original_tr", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Son y\\\\u0131llarda g\\\\u00f6r\\\\u00fclen ay tutulmalar...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tt": {"config_name": "unshuffled_original_tt", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\\\\"\\\\u0418\\\\u0440\\\\u0435\\\\u043c\\\\u043d\\\\u0435\\\\u04a3 \\\\u04...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_tyv": {"config_name": "unshuffled_original_tyv", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u042d\\\\u043a\\\\u0438\\\\u0438, \\\\u0445\\\\u04af\\\\u043d\\\\u043...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ug": {"config_name": "unshuffled_original_ug", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0632\\\\u0627\\\\u06ad-\\\\u0621\\\\u062a\\\\u06c7\\\\u0632\\\\u0649...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_uk": {"config_name": "unshuffled_original_uk", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u041f\\\\u0440\\\\u043e \\\\u043d\\\\u0430\\\\u0434\\\\u0430\\\\u043d...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_ur": {"config_name": "unshuffled_original_ur", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0622\\\\u0626\\\\u06cc\\\\u06d2 \\\\u0627\\\\u06c1\\\\u0645 \\\\u062...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_uz": {"config_name": "unshuffled_original_uz", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Markazi Sariosiyo shaharchasi. 1926-yil 29-sentab...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_vec": {"config_name": "unshuffled_original_vec", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Par ogni p\\\\u00f3nto, \\\\u0142a derivada \\\\u0142a xe ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_vi": {"config_name": "unshuffled_original_vi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Canh chua c\\\\u00e1 b\\\\u00f4ng lau kh\\\\u00f4ng ch\\\\u1e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_vo": {"config_name": "unshuffled_original_vo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Mettweiler binon zif in fedal\\\\u00e4n: Rheinland-P...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_wa": {"config_name": "unshuffled_original_wa", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"ci n' est n\\\\u00e9n l' viyaedje lu-minme ki sait e...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_war": {"config_name": "unshuffled_original_war", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"An Tajan amo in usa ka komyun ha departamento han...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_wuu": {"config_name": "unshuffled_original_wuu", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u4f0a15 [I] | \\\\u4f0a17 | \\\\u4f0a19 | \\\\u4f0a21 | \\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_xal": {"config_name": "unshuffled_original_xal", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u0410\\\\u0440\\\\u043d\\\\u0433\\\\u0443\\\\u0434\\\\u0438\\\\u043d ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_xmf": {"config_name": "unshuffled_original_xmf", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u10db\\\\u10dd\\\\u10e9\\\\u10d0\\\\u10db\\\\u10d8\\\\u10da\\\\u10d8 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_yi": {"config_name": "unshuffled_original_yi", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u05de\\\\u05de\\\\u05e9\\\\u05d5\\\\u05ea\\\\u05d3\\\\u05d9\\\\u05e7 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_yo": {"config_name": "unshuffled_original_yo", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"Copyright \\\\u00a9 2018 BBC. BBC k\\\\u00f2 m\\\\u1ecd\\\\u0...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_yue": {"config_name": "unshuffled_original_yue", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"*hughughughughughughughughughughughughughughughug...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}, "unshuffled_original_zh": {"config_name": "unshuffled_original_zh", "sample_row": "{\"id\": \"0\", \"text\": \"\\\"\\\\u65f6\\\\u95f4\\\\u53ef\\\\u4ee5\\\\u88ab\\\\u7f29\\\\u77ed\\\\uff0c\\\\...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "The Open Super-large Crawled ALMAnaCH coRpus is a huge multilingual corpus obtained by language classification and filtering of the Common Crawl corpus using the goclassy architecture.", "dataset_name": "oscar"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original", "language:af", "language:als", "language:am", "language:an", "language:ar", "language:arz", "language:as", "language:ast", "language:av", "language:az", "language:azb", "language:ba", "language:bar", "language:bcl", "language:be", "language:bg", "language:bh", "language:bn", "language:bo", "language:bpy", "language:br", "language:bs", "language:bxr", "language:ca", "language:cbk", "language:ce", "language:ceb", "language:ckb", "language:cs", "language:cv", "language:cy", "language:da", "language:de", "language:diq", "language:dsb", "language:dv", "language:el", "language:eml", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:fi", "language:fr", "language:frr", "language:fy", "language:ga", "language:gd", "language:gl", "language:gn", "language:gom", "language:gu", "language:he", "language:hi", "language:hr", "language:hsb", "language:ht", "language:hu", "language:hy", "language:ia", "language:id", "language:ie", "language:ilo", "language:io", "language:is", "language:it", "language:ja", "language:jbo", "language:jv", "language:ka", "language:kk", "language:km", "language:kn", "language:ko", "language:krc", "language:ku", "language:kv", "language:kw", "language:ky", "language:la", "language:lb", "language:lez", "language:li", "language:lmo", "language:lo", "language:lrc", "language:lt", "language:lv", "language:mai", "language:mg", "language:mhr", "language:min", "language:mk", "language:ml", "language:mn", "language:mr", "language:mrj", "language:ms", "language:mt", "language:mwl", "language:my", "language:myv", "language:mzn", "language:nah", "language:nap", "language:nds", "language:ne", "language:new", "language:nl", "language:nn", "language:no", "language:oc", "language:or", "language:os", "language:pa", "language:pam", "language:pl", "language:pms", "language:pnb", "language:ps", "language:pt", "language:qu", "language:rm", "language:ro", "language:ru", "language:sa", "language:sah", "language:scn", "language:sd", "language:sh", "language:si", "language:sk", "language:sl", "language:so", "language:sq", "language:sr", "language:su", "language:sv", "language:sw", "language:ta", "language:te", "language:tg", "language:th", "language:tk", "language:tl", "language:tr", "language:tt", "language:tyv", "language:ug", "language:uk", "language:ur", "language:uz", "language:vec", "language:vi", "language:vo", "language:wa", "language:war", "language:wuu", "language:xal", "language:xmf", "language:yi", "language:yo", "language:yue", "language:zh"], "is_gated": false}, "para_pat": {"dataset_name": "para_pat", "description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.", "downloads": 2972, "configs": {"el-en": {"config_name": "el-en", "sample_row": "{\"index\": \"844\", \"family_id\": \"10944407\", \"translation.el\": \"\\\"\\\\u03b1\\\\u03c6\\\\u03ad\\\\u03c2 \\\\u03bf \\\\u03bf\\\\u03c0\\\\u03b...\", \"translation.en\": \"\\\"offee prepared using the mix for Greek coffee eit...\"}", "columns": ["index", "family_id", "translation_el", "translation_en"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.el": "translation_el", "translation.en": "translation_en"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "cs-en": {"config_name": "cs-en", "sample_row": "{\"index\": \"1372\", \"family_id\": \"6516810\", \"translation.cs\": \"\\\"\\\\u0158e\\\\u0161en\\\\u00ed se t\\\\u00fdk\\\\u00e1 herbicid\\\\...\", \"translation.en\": \"\\\"The present invention relates to herbicides and p...\"}", "columns": ["index", "family_id", "translation_cs", "translation_en"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.cs": "translation_cs", "translation.en": "translation_en"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-hu": {"config_name": "en-hu", "sample_row": "{\"index\": \"16\", \"family_id\": \"4180910\", \"translation.en\": \"\\\"Module containing solar cells (7), having two mut...\", \"translation.hu\": \"\\\"Napelemeket (7) tartalmaz\\\\u00f3 modul, amelynek k...\"}", "columns": ["index", "family_id", "translation_en", "translation_hu"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.en": "translation_en", "translation.hu": "translation_hu"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-ro": {"config_name": "en-ro", "sample_row": "{\"index\": \"16\", \"family_id\": \"6111771\", \"translation.en\": \"\\\"The invention relates to a process for the prepar...\", \"translation.ro\": \"\\\"Inventia se refera la un procedeu pentru obtinere...\"}", "columns": ["index", "family_id", "translation_en", "translation_ro"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.en": "translation_en", "translation.ro": "translation_ro"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-sk": {"config_name": "en-sk", "sample_row": "{\"index\": \"758\", \"family_id\": \"5346370\", \"translation.en\": \"\\\"Delay is converted from synchronous the culture ...\", \"translation.sk\": \"\\\"Slaehtenie sa p\\\\u0159ev\\\\u00e1d\\\\u00ed zo synchr\\\\u0...\"}", "columns": ["index", "family_id", "translation_en", "translation_sk"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.en": "translation_en", "translation.sk": "translation_sk"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-uk": {"config_name": "en-uk", "sample_row": "{\"index\": \"3421\", \"family_id\": \"52275661\", \"translation.en\": \"\\\"A replaceable handle to kitchen appliances compri...\", \"translation.uk\": \"\\\"\\\\u0417\\\\u043d\\\\u0456\\\\u043c\\\\u043d\\\\u0430 \\\\u0440\\\\u0443...\"}", "columns": ["index", "family_id", "translation_en", "translation_uk"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.en": "translation_en", "translation.uk": "translation_uk"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "es-fr": {"config_name": "es-fr", "sample_row": "{\"index\": \"3077\", \"family_id\": \"8244348\", \"translation.es\": \"\\\"La presente invenci\\\\u00f3n se refiere a un proced...\", \"translation.fr\": \"\\\"L'invention concerne un proc\\\\u00e9d\\\\u00e9 de fabr...\"}", "columns": ["index", "family_id", "translation_es", "translation_fr"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.es": "translation_es", "translation.fr": "translation_fr"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "fr-ru": {"config_name": "fr-ru", "sample_row": "{\"index\": \"18646\", \"family_id\": \"38723544\", \"translation.fr\": \"\\\"L'invention appartient au domaine de la microbiol...\", \"translation.ru\": \"\\\"\\\\u0418\\\\u0437\\\\u043e\\\\u0431\\\\u0440\\\\u0435\\\\u0442\\\\u0435\\\\...\"}", "columns": ["index", "family_id", "translation_fr", "translation_ru"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.fr": "translation_fr", "translation.ru": "translation_ru"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"translation.de\": \"\\\"Der Signalaustausch zwischen den Funktionseinheit...\", \"translation.fr\": \"\\\"L'\\\\u00e9change de signaux entre les unit\\\\u00e9s f...\"}", "columns": ["translation_de", "translation_fr"], "columns_mapping": {"translation.de": "translation_de", "translation.fr": "translation_fr"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-ja": {"config_name": "en-ja", "sample_row": "{\"translation.en\": \"\\\"The computer system (200) is connected to a datab...\", \"translation.ja\": \"\\\"\\\\u30b3\\\\u30f3\\\\u30d4\\\\u30e5\\\\u30fc\\\\u30bf\\\\u30b7\\\\u30b9\\\\...\"}", "columns": ["translation_en", "translation_ja"], "columns_mapping": {"translation.en": "translation_en", "translation.ja": "translation_ja"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-es": {"config_name": "en-es", "sample_row": "{\"translation.en\": \"\\\"A method for converting a series of m-bit informa...\", \"translation.es\": \"\\\"Se describe un m\\\\u00e9todo para convertir una ser...\"}", "columns": ["translation_en", "translation_es"], "columns_mapping": {"translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"translation.en\": \"\\\"According to the invention, the method (1) compri...\", \"translation.fr\": \"\\\"Selon l'invention, le proc\\\\u00e9d\\\\u00e9 (1) compr...\"}", "columns": ["translation_en", "translation_fr"], "columns_mapping": {"translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "de-en": {"config_name": "de-en", "sample_row": "{\"translation.de\": \"\\\"Das Textilband (1) ist insbesondere als (Kreuz-) ...\", \"translation.en\": \"\\\"The textile band (1) is used particularly as (cro...\"}", "columns": ["translation_de", "translation_en"], "columns_mapping": {"translation.de": "translation_de", "translation.en": "translation_en"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-ko": {"config_name": "en-ko", "sample_row": "{\"translation.en\": \"\\\"In a preferred embodiment, the TAMS includes a pr...\", \"translation.ko\": \"\\\"\\\\ubc14\\\\ub78c\\\\uc9c1\\\\ud55c \\\\uc2e4\\\\uc2dc\\\\uc608\\\\uc5d0...\"}", "columns": ["translation_en", "translation_ko"], "columns_mapping": {"translation.en": "translation_en", "translation.ko": "translation_ko"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "fr-ja": {"config_name": "fr-ja", "sample_row": "{\"translation.fr\": \"\\\"Le dispositif de transmission transmet un signal ...\", \"translation.ja\": \"\\\"\\\\u3010\\\\u89e3\\\\u6c7a\\\\u624b\\\\u6bb5\\\\u3011\\\\u672c\\\\u6280\\\\...\"}", "columns": ["translation_fr", "translation_ja"], "columns_mapping": {"translation.fr": "translation_fr", "translation.ja": "translation_ja"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-zh": {"config_name": "en-zh", "sample_row": "{\"translation.en\": \"\\\"The door lock is characterized in that fluorescen...\", \"translation.zh\": \"\\\"\\\\u672c\\\\u5b9e\\\\u7528\\\\u65b0\\\\u578b\\\\u5305\\\\u62ec\\\\u95e8\\\\...\"}", "columns": ["translation_en", "translation_zh"], "columns_mapping": {"translation.en": "translation_en", "translation.zh": "translation_zh"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-ru": {"config_name": "en-ru", "sample_row": "{\"translation.en\": \"\\\"S from 00 00\\\"\", \"translation.ru\": \"\\\"S \\\\u0441\\\\u043e 00 00\\\"\"}", "columns": ["translation_en", "translation_ru"], "columns_mapping": {"translation.en": "translation_en", "translation.ru": "translation_ru"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "fr-ko": {"config_name": "fr-ko", "sample_row": "{\"index\": \"10794\", \"family_id\": \"34746474\", \"translation.fr\": \"\\\"La pr\\\\u00e9sente invention concerne un proc\\\\u00e9...\", \"translation.ko\": \"\\\"\\\\ubcf8 \\\\ubc1c\\\\uba85\\\\uc740 \\\\uc6a9\\\\ub9e4\\\\uc911\\\\uc75...\"}", "columns": ["index", "family_id", "translation_fr", "translation_ko"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.fr": "translation_fr", "translation.ko": "translation_ko"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "ru-uk": {"config_name": "ru-uk", "sample_row": "{\"index\": \"3431\", \"family_id\": \"52281850\", \"translation.ru\": \"\\\"\\\\u0421\\\\u043f\\\\u043e\\\\u0441\\\\u043e\\\\u0431 \\\\u0432\\\\u044b...\", \"translation.uk\": \"\\\"\\\\u0421\\\\u043f\\\\u043e\\\\u0441\\\\u0456\\\\u0431 \\\\u0432\\\\u0438...\"}", "columns": ["index", "family_id", "translation_ru", "translation_uk"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.ru": "translation_ru", "translation.uk": "translation_uk"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}, "en-pt": {"config_name": "en-pt", "sample_row": "{\"index\": \"22818\", \"family_id\": \"40951751\", \"translation.en\": \"\\\"The present invention relates to a process for th...\", \"translation.pt\": \"\\\"A presente inven\\\\u00e7\\\\u00e3o refere-se a um proc...\"}", "columns": ["index", "family_id", "translation_en", "translation_pt"], "columns_mapping": {"index": "index", "family_id": "family_id", "translation.en": "translation_en", "translation.pt": "translation_pt"}, "dataset_description": "ParaPat: The Multi-Million Sentences Parallel Corpus of Patents Abstracts\n\nThis dataset contains the developed parallel corpus from the open access Google\nPatents dataset in 74 language pairs, comprising more than 68 million sentences\nand 800 million tokens. Sentences were automatically aligned using the Hunalign algorithm\nfor the largest 22 language pairs, while the others were abstract (i.e. paragraph) aligned.\n\n", "dataset_name": "para_pat"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_categories:translation", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:machine-generated", "multilinguality:translation", "source_datasets:original", "language:cs", "language:de", "language:el", "language:en", "language:es", "language:fr", "language:hu", "language:ja", "language:ko", "language:pt", "language:ro", "language:ru", "language:sk", "language:uk", "language:zh"], "is_gated": false}, "parsinlu_reading_comprehension": {"dataset_name": "parsinlu_reading_comprehension", "description": "A Persian reading comprehenion task (generating an answer, given a question and a context paragraph).\nThe questions are mined using Google auto-complete, their answers and the corresponding evidence documents are manually annotated by native speakers.", "downloads": 304, "configs": {"parsinlu-repo": {"config_name": "parsinlu-repo", "sample_row": "{\"question\": \"\\\"\\\\u0686\\\\u0631\\\\u0627 \\\\u0622\\\\u0645\\\\u0631\\\\u06cc\\\\u06a9...\", \"url\": \"\\\"https://www.bbc.com/persian/iran-46851613\\\"\", \"context\": \"\\\"\\\\u0644\\\\u0647\\\\u0633\\\\u062a\\\\u0627\\\\u0646 \\\\u06cc\\\\u06a9...\", \"answers.answer_start\": \"[427]\", \"answers.answer_text\": \"[\\\"\\\\u0646\\\\u0632\\\\u062f\\\\u06cc\\\\u06a9\\\\u06cc \\\\u0631\\\\u064...\"}", "columns": ["question", "url", "context", "answers_answer_start", "answers_answer_text"], "columns_mapping": {"question": "question", "url": "url", "context": "context", "answers.answer_start": "answers_answer_start", "answers.answer_text": "answers_answer_text"}, "dataset_description": "A Persian reading comprehenion task (generating an answer, given a question and a context paragraph).\nThe questions are mined using Google auto-complete, their answers and the corresponding evidence documents are manually annotated by native speakers.\n", "dataset_name": "parsinlu_reading_comprehension"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|wikipedia|google", "language:fa"], "is_gated": false}, "paws-x": {"dataset_name": "paws-x", "description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.", "downloads": 26191, "configs": {"en": {"config_name": "en", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"In Paris , in October 1560 , he secretly met the ...\", \"sentence2\": \"\\\"In October 1560 , he secretly met with the Englis...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "de": {"config_name": "de", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"Im Oktober 1560 traf er sich in Paris heimlich mi...\", \"sentence2\": \"\\\"Im Oktober 1560 traf er sich heimlich mit dem eng...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "es": {"config_name": "es", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"En Par\\\\u00eds, en octubre de 1560, se reuni\\\\u00f3...\", \"sentence2\": \"\\\"En octubre de 1560, se reuni\\\\u00f3 en secreto con...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "fr": {"config_name": "fr", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"\\\\u00c0 Paris, en octobre 1560, il rencontra secr\\\\...\", \"sentence2\": \"\\\"En octobre 1560, il rencontra secr\\\\u00e8tement l'...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "ja": {"config_name": "ja", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"1560\\\\u5e7410\\\\u6708\\\\u306b\\\\u30d1\\\\u30ea\\\\u3067\\\\u3001\\\\...\", \"sentence2\": \"\\\"1560\\\\u5e7410\\\\u6708\\\\u3001\\\\u5f7c\\\\u306f\\\\u30d1\\\\u30ea\\\\...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "ko": {"config_name": "ko", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"1560 \\\\ub144 10 \\\\uc6d4 \\\\ud30c\\\\ub9ac\\\\uc5d0\\\\uc11c \\\\u...\", \"sentence2\": \"\\\"1560 \\\\ub144 10 \\\\uc6d4 \\\\uadf8\\\\ub294 \\\\ud30c\\\\ub9ac\\\\u...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}, "zh": {"config_name": "zh", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"1560\\\\u5e7410\\\\u6708\\\\uff0c\\\\u4ed6\\\\u5728\\\\u5df4\\\\u9ece\\\\...\", \"sentence2\": \"\\\"1560\\\\u5e7410\\\\u6708\\\\uff0c\\\\u4ed6\\\\u5728\\\\u5df4\\\\u9ece\\\\...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS-X, a multilingual version of PAWS (Paraphrase Adversaries from Word Scrambling) for six languages.\n\nThis dataset contains 23,659 human translated PAWS evaluation pairs and 296,406 machine\ntranslated training pairs in six typologically distinct languages: French, Spanish, German,\nChinese, Japanese, and Korean. English language is available by default. All translated\npairs are sourced from examples in PAWS-Wiki.\n\nFor further details, see the accompanying paper: PAWS-X: A Cross-lingual Adversarial Dataset\nfor Paraphrase Identification (https://arxiv.org/abs/1908.11828)\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws-x"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "task_ids:semantic-similarity-scoring", "task_ids:text-scoring", "task_ids:multi-input-text-classification", "annotations_creators:expert-generated", "annotations_creators:machine-generated", "multilinguality:multilingual", "source_datasets:extended|other-paws", "language:de", "language:en", "language:es", "language:fr", "language:ja", "language:ko", "language:zh", "paraphrase-identification"], "is_gated": false}, "paws": {"dataset_name": "paws", "description": "PAWS: Paraphrase Adversaries from Word Scrambling\n\nThis dataset contains 108,463 human-labeled and 656k noisily labeled pairs that feature\nthe importance of modeling structure, context, and word order information for the problem\nof paraphrase identification. The dataset has two subsets, one based on Wikipedia and the\nother one based on the Quora Question Pairs (QQP) dataset.\n\nFor further details, see the accompanying paper: PAWS: Paraphrase Adversaries from Word Scrambling\n(https://arxiv.org/abs/1904.01130)\n\nPAWS-QQP is not available due to license of QQP. It must be reconstructed by downloading the original\ndata and then running our scripts to produce the data and attach the labels.\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.", "downloads": 8002, "configs": {"labeled_final": {"config_name": "labeled_final", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"In Paris , in October 1560 , he secretly met the ...\", \"sentence2\": \"\\\"In October 1560 , he secretly met with the Englis...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS: Paraphrase Adversaries from Word Scrambling\n\nThis dataset contains 108,463 human-labeled and 656k noisily labeled pairs that feature\nthe importance of modeling structure, context, and word order information for the problem\nof paraphrase identification. The dataset has two subsets, one based on Wikipedia and the\nother one based on the Quora Question Pairs (QQP) dataset.\n\nFor further details, see the accompanying paper: PAWS: Paraphrase Adversaries from Word Scrambling\n(https://arxiv.org/abs/1904.01130)\n\nPAWS-QQP is not available due to license of QQP. It must be reconstructed by downloading the original\ndata and then running our scripts to produce the data and attach the labels.\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws"}, "labeled_swap": {"config_name": "labeled_swap", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"`` B. i. seychellarum '' is smaller and shorter-w...\", \"sentence2\": \"\\\"`` B. i. seychellarum '' is smaller and shorter-o...\", \"label\": \"0\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS: Paraphrase Adversaries from Word Scrambling\n\nThis dataset contains 108,463 human-labeled and 656k noisily labeled pairs that feature\nthe importance of modeling structure, context, and word order information for the problem\nof paraphrase identification. The dataset has two subsets, one based on Wikipedia and the\nother one based on the Quora Question Pairs (QQP) dataset.\n\nFor further details, see the accompanying paper: PAWS: Paraphrase Adversaries from Word Scrambling\n(https://arxiv.org/abs/1904.01130)\n\nPAWS-QQP is not available due to license of QQP. It must be reconstructed by downloading the original\ndata and then running our scripts to produce the data and attach the labels.\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws"}, "unlabeled_final": {"config_name": "unlabeled_final", "sample_row": "{\"id\": \"1\", \"sentence1\": \"\\\"The film was remade in Telugu with the same name ...\", \"sentence2\": \"\\\"The film was written in Telugu with the same name...\", \"label\": \"1\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "PAWS: Paraphrase Adversaries from Word Scrambling\n\nThis dataset contains 108,463 human-labeled and 656k noisily labeled pairs that feature\nthe importance of modeling structure, context, and word order information for the problem\nof paraphrase identification. The dataset has two subsets, one based on Wikipedia and the\nother one based on the Quora Question Pairs (QQP) dataset.\n\nFor further details, see the accompanying paper: PAWS: Paraphrase Adversaries from Word Scrambling\n(https://arxiv.org/abs/1904.01130)\n\nPAWS-QQP is not available due to license of QQP. It must be reconstructed by downloading the original\ndata and then running our scripts to produce the data and attach the labels.\n\nNOTE: There might be some missing or wrong labels in the dataset and we have replaced them with -1.\n", "dataset_name": "paws"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "task_ids:semantic-similarity-scoring", "task_ids:text-scoring", "task_ids:multi-input-text-classification", "annotations_creators:expert-generated", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "paraphrase-identification"], "is_gated": false}, "pec": {"dataset_name": "pec", "description": "\\\r\nA dataset of around 350K persona-based empathetic conversations. Each speaker is associated with a persona, which comprises multiple persona sentences. The response of each conversation is empathetic.", "downloads": 562, "configs": {"happy": {"config_name": "happy", "sample_row": "{\"personas\": \"[\\\"i have a roku tv that came with a shitty basic r...\", \"context\": \"[\\\"found out this morning i got a job promotion ! !...\", \"context_speakers\": \"[\\\"HeWentToJared91\\\"]\", \"response\": \"\\\"whilst popping ?\\\"\", \"response_speaker\": \"\\\"Evref\\\"\"}", "columns": ["personas", "context", "context_speakers", "response", "response_speaker"], "columns_mapping": {"personas": "personas", "context": "context", "context_speakers": "context_speakers", "response": "response", "response_speaker": "response_speaker"}, "dataset_description": "A dataset of around 350K persona-based empathetic conversations. Each speaker is associated with a persona, which comprises multiple persona sentences. The response of each conversation is empathetic.\n", "dataset_name": "pec"}, "offmychest": {"config_name": "offmychest", "sample_row": "{\"personas\": \"[\\\"i do n't want to see it .\\\", \\\"i 'd love to know m...\", \"context\": \"[\\\"i want to die . in last few months i lost my job...\", \"context_speakers\": \"[\\\"1wannadie\\\"]\", \"response\": \"\\\"hold on to life , look around you and realise wha...\", \"response_speaker\": \"\\\"Lulwafahd\\\"\"}", "columns": ["personas", "context", "context_speakers", "response", "response_speaker"], "columns_mapping": {"personas": "personas", "context": "context", "context_speakers": "context_speakers", "response": "response", "response_speaker": "response_speaker"}, "dataset_description": "A dataset of around 350K persona-based empathetic conversations. Each speaker is associated with a persona, which comprises multiple persona sentences. The response of each conversation is empathetic.\n", "dataset_name": "pec"}, "all": {"config_name": "all", "sample_row": "{\"personas\": \"[\\\"i have a roku tv that came with a shitty basic r...\", \"context\": \"[\\\"found out this morning i got a job promotion ! !...\", \"context_speakers\": \"[\\\"HeWentToJared91\\\"]\", \"response\": \"\\\"whilst popping ?\\\"\", \"response_speaker\": \"\\\"Evref\\\"\"}", "columns": ["personas", "context", "context_speakers", "response", "response_speaker"], "columns_mapping": {"personas": "personas", "context": "context", "context_speakers": "context_speakers", "response": "response", "response_speaker": "response_speaker"}, "dataset_description": "A dataset of around 350K persona-based empathetic conversations. Each speaker is associated with a persona, which comprises multiple persona sentences. The response of each conversation is empathetic.\n", "dataset_name": "pec"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_categories:text-retrieval", "task_ids:dialogue-modeling", "task_ids:utterance-retrieval", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "peoples_daily_ner": {"dataset_name": "peoples_daily_ner", "description": "People's Daily NER Dataset is a commonly used dataset for Chinese NER, with\ntext from People's Daily (\u4eba\u6c11\u65e5\u62a5), the largest official newspaper.\n\nThe dataset is in BIO scheme. Entity types are: PER (person), ORG (organization)\nand LOC (location).", "downloads": 829, "configs": {"peoples_daily_ner": {"config_name": "peoples_daily_ner", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u6d77\\\", \\\"\\\\u9493\\\", \\\"\\\\u6bd4\\\", \\\"\\\\u8d5b\\\", \\\"\\\\u5730\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 5, 6, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "People's Daily NER Dataset is a commonly used dataset for Chinese NER, with\ntext from People's Daily (\u4eba\u6c11\u65e5\u62a5), the largest official newspaper.\n\nThe dataset is in BIO scheme. Entity types are: PER (person), ORG (organization)\nand LOC (location).\n", "dataset_name": "peoples_daily_ner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:zh"], "is_gated": false}, "persian_ner": {"dataset_name": "persian_ner", "description": "The dataset includes 250,015 tokens and 7,682 Persian sentences in total. It is available in 3 folds to be used in turn as training and test sets. The NER tags are in IOB format.", "downloads": 560, "configs": {"fold1": {"config_name": "fold1", "sample_row": "{\"tokens\": \"[\\\"\\\\u0628\\\\u0647\\\", \\\"\\\\u0639\\\\u0646\\\\u0648\\\\u0627\\\\u0646\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["tokens", "ner_tags"], "columns_mapping": {"tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The dataset includes 250,015 tokens and 7,682 Persian sentences in total. It is available in 3 folds to be used in turn as training and test sets. The NER tags are in IOB format.\n", "dataset_name": "persian_ner"}, "fold2": {"config_name": "fold2", "sample_row": "{\"tokens\": \"[\\\"\\\\u0627\\\\u0641\\\\u0642\\\\u06cc\\\", \\\":\\\", \\\"0\\\", \\\"\\\\u0640\\\", \\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0...\"}", "columns": ["tokens", "ner_tags"], "columns_mapping": {"tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The dataset includes 250,015 tokens and 7,682 Persian sentences in total. It is available in 3 folds to be used in turn as training and test sets. The NER tags are in IOB format.\n", "dataset_name": "persian_ner"}, "fold3": {"config_name": "fold3", "sample_row": "{\"tokens\": \"[\\\"\\\\u0627\\\\u0641\\\\u0642\\\\u06cc\\\", \\\":\\\", \\\"0\\\", \\\"\\\\u0640\\\", \\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0...\"}", "columns": ["tokens", "ner_tags"], "columns_mapping": {"tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The dataset includes 250,015 tokens and 7,682 Persian sentences in total. It is available in 3 folds to be used in turn as training and test sets. The NER tags are in IOB format.\n", "dataset_name": "persian_ner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:fa"], "is_gated": false}, "php": {"dataset_name": "php", "description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M", "downloads": 838, "configs": {"fi-nl": {"config_name": "fi-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"PHP K\\\\u00e4sikirja\\\"\", \"translation.nl\": \"\\\"PHP Handleiding\\\"\"}", "columns": ["id", "translation_fi", "translation_nl"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.nl": "translation_nl"}, "dataset_description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M\n", "dataset_name": "php"}, "it-ro": {"config_name": "it-ro", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.it\": \"\\\"Manuale PHP\\\"\", \"translation.ro\": \"\\\"Manual PHP\\\"\"}", "columns": ["id", "translation_it", "translation_ro"], "columns_mapping": {"id": "id", "translation.it": "translation_it", "translation.ro": "translation_ro"}, "dataset_description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M\n", "dataset_name": "php"}, "nl-sv": {"config_name": "nl-sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.nl\": \"\\\"PHP Handleiding\\\"\", \"translation.sv\": \"\\\"PHP-manual\\\"\"}", "columns": ["id", "translation_nl", "translation_sv"], "columns_mapping": {"id": "id", "translation.nl": "translation_nl", "translation.sv": "translation_sv"}, "dataset_description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M\n", "dataset_name": "php"}, "en-it": {"config_name": "en-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"PHP Manual\\\"\", \"translation.it\": \"\\\"Manuale PHP\\\"\"}", "columns": ["id", "translation_en", "translation_it"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.it": "translation_it"}, "dataset_description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M\n", "dataset_name": "php"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"PHP Manual\\\"\", \"translation.fr\": \"\\\"Manuel PHP\\\"\"}", "columns": ["id", "translation_en", "translation_fr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "A parallel corpus originally extracted from http://se.php.net/download-docs.php. The original documents are written in English and have been partly translated into 21 languages. The original manuals contain about 500,000 words. The amount of actually translated texts varies for different languages between 50,000 and 380,000 words. The corpus is rather noisy and may include parts from the English original in some of the translations. The corpus is tokenized and each language pair has been sentence aligned.\n\n23 languages, 252 bitexts\ntotal number of files: 71,414\ntotal number of tokens: 3.28M\ntotal number of sentence fragments: 1.38M\n", "dataset_name": "php"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:cs", "language:de", "language:en", "language:es", "language:fi", "language:fr", "language:he", "language:hu", "language:it", "language:ja", "language:ko", "language:nl", "language:pl", "language:pt", "language:ro", "language:ru", "language:sk", "language:sl", "language:sv", "language:tr", "language:tw", "language:zh"], "is_gated": false}, "etalab-ia/piaf": {"dataset_name": "etalab-ia/piaf", "description": "Piaf is a reading comprehension dataset. This version, published in February 2020, contains 3835 questions on French Wikipedia.", "downloads": 307, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"p140295443291664\\\"\", \"title\": \"\\\"Sport\\\"\", \"context\": \"\\\"Les d\\\\u00e9penses des m\\\\u00e9nages repr\\\\u00e9sent...\", \"question\": \"\\\"Combien de personnes travaillent au minist\\\\u00e8r...\", \"answers.text\": \"[\\\"100 000\\\"]\", \"answers.answer_start\": \"[472]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "Piaf is a reading comprehension dataset. This version, published in February 2020, contains 3835 questions on French Wikipedia.\n", "dataset_name": "etalab-ia/piaf"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:fr"], "is_gated": false}, "gsarti/clean_mc4_it": {"dataset_name": "gsarti/clean_mc4_it", "description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.", "downloads": 1135, "configs": {"tiny": {"config_name": "tiny", "sample_row": "{\"text\": \"\\\"Per raggiungere il campo attraversiamo la strisci...\", \"timestamp\": \"\\\"2020-02-22T22:24:31Z\\\"\", \"url\": \"\\\"https://altreconomia.it/una-rotonda-sul-pane/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "gsarti/clean_mc4_it"}, "small": {"config_name": "small", "sample_row": "{\"text\": \"\\\"Per raggiungere il campo attraversiamo la strisci...\", \"timestamp\": \"\\\"2020-02-22T22:24:31Z\\\"\", \"url\": \"\\\"https://altreconomia.it/una-rotonda-sul-pane/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "gsarti/clean_mc4_it"}, "medium": {"config_name": "medium", "sample_row": "{\"text\": \"\\\"Per raggiungere il campo attraversiamo la strisci...\", \"timestamp\": \"\\\"2020-02-22T22:24:31Z\\\"\", \"url\": \"\\\"https://altreconomia.it/una-rotonda-sul-pane/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "gsarti/clean_mc4_it"}, "large": {"config_name": "large", "sample_row": "{\"text\": \"\\\"Per raggiungere il campo attraversiamo la strisci...\", \"timestamp\": \"\\\"2020-02-22T22:24:31Z\\\"\", \"url\": \"\\\"https://altreconomia.it/una-rotonda-sul-pane/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "gsarti/clean_mc4_it"}, "full": {"config_name": "full", "sample_row": "{\"text\": \"\\\"Per raggiungere il campo attraversiamo la strisci...\", \"timestamp\": \"\\\"2020-02-22T22:24:31Z\\\"\", \"url\": \"\\\"https://altreconomia.it/una-rotonda-sul-pane/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "gsarti/clean_mc4_it"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:extended", "language:it"], "is_gated": false}, "gsarti/itacola": {"dataset_name": "gsarti/itacola", "description": "The Italian Corpus of Linguistic Acceptability includes almost 10k sentences taken from \nlinguistic literature with a binary annotation made by the original authors themselves. \nThe work is inspired by the English Corpus of Linguistic Acceptability (CoLA) by Warstadt et al.\nPart of the dataset has been manually annotated to highlight 9 linguistic phenomena.", "downloads": 468, "configs": {"scores": {"config_name": "scores", "sample_row": "{\"unique_id\": \"1\", \"source\": \"\\\"Graffi_1994\\\"\", \"acceptability\": \"1\", \"sentence\": \"\\\"Quest'uomo mi ha colpito.\\\"\"}", "columns": ["unique_id", "source", "acceptability", "sentence"], "columns_mapping": {"unique_id": "unique_id", "source": "source", "acceptability": "acceptability", "sentence": "sentence"}, "dataset_description": "The Italian Corpus of Linguistic Acceptability includes almost 10k sentences taken from \nlinguistic literature with a binary annotation made by the original authors themselves. \nThe work is inspired by the English Corpus of Linguistic Acceptability (CoLA) by Warstadt et al.\nPart of the dataset has been manually annotated to highlight 9 linguistic phenomena.\n", "dataset_name": "gsarti/itacola"}, "phenomena": {"config_name": "phenomena", "sample_row": "{\"unique_id\": \"1\", \"source\": \"\\\"Graffi_1994\\\"\", \"acceptability\": \"1\", \"sentence\": \"\\\"Quest'uomo mi ha colpito.\\\"\", \"cleft_construction\": \"0\", \"copular_construction\": \"0\", \"subject_verb_agreement\": \"1\", \"wh_islands_violations\": \"0\", \"simple\": \"0\", \"question\": \"0\", \"auxiliary\": \"1\", \"bind\": \"0\", \"indefinite_pronouns\": \"0\"}", "columns": ["unique_id", "source", "acceptability", "sentence", "cleft_construction", "copular_construction", "subject_verb_agreement", "wh_islands_violations", "simple", "question", "auxiliary", "bind", "indefinite_pronouns"], "columns_mapping": {"unique_id": "unique_id", "source": "source", "acceptability": "acceptability", "sentence": "sentence", "cleft_construction": "cleft_construction", "copular_construction": "copular_construction", "subject_verb_agreement": "subject_verb_agreement", "wh_islands_violations": "wh_islands_violations", "simple": "simple", "question": "question", "auxiliary": "auxiliary", "bind": "bind", "indefinite_pronouns": "indefinite_pronouns"}, "dataset_description": "The Italian Corpus of Linguistic Acceptability includes almost 10k sentences taken from \nlinguistic literature with a binary annotation made by the original authors themselves. \nThe work is inspired by the English Corpus of Linguistic Acceptability (CoLA) by Warstadt et al.\nPart of the dataset has been manually annotated to highlight 9 linguistic phenomena.\n", "dataset_name": "gsarti/itacola"}}, "tags": ["task_categories:text-classification", "task_ids:acceptability-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:it"], "is_gated": false}, "jegormeister/dutch-snli": {"dataset_name": "jegormeister/dutch-snli", "description": "This is the Dutch version of the original SNLI dataset. The translation was performed using Google Translate. Original SNLI available at https://nlp.stanford.edu/projects/snli/", "downloads": 277, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"premise\": \"\\\"Een persoon op een paard springt over een kapot v...\", \"hypothesis\": \"\\\"Een persoon traint zijn paard voor een wedstrijd....\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "This is the Dutch version of the original SNLI dataset. The translation was performed using Google Translate. Original SNLI available at https://nlp.stanford.edu/projects/snli/\n", "dataset_name": "jegormeister/dutch-snli"}}, "tags": ["language:nl"], "is_gated": false}, "ju-bezdek/conll2003-SK-NER": {"dataset_name": "ju-bezdek/conll2003-SK-NER", "description": "This is translated version of the original CONLL2003 dataset (translated from English to Slovak via Google translate) Annotation was done mostly automatically with word matching scripts. Records where some tags were not matched, were annotated manually (10%) Unlike the original Conll2003 dataset, this one contains only NER tags", "downloads": 11, "configs": {"conll2003-SK-NER": {"config_name": "conll2003-SK-NER", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"E\\\\u00da\\\", \\\"odmieta\\\", \\\"nemeck\\\\u00fa\\\", \\\"v\\\\u00fdzvu...\", \"ner_tags\": \"[3, 0, 7, 0, 0, 0, 7, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "This is translated version of the original CONLL2003 dataset (translated from English to Slovak via Google translate) Annotation was done mostly automatically with word matching scripts. Records where some tags were not matched, were annotated manually (10%) Unlike the original Conll2003 dataset, this one contains only NER tags\n", "dataset_name": "ju-bezdek/conll2003-SK-NER"}}, "tags": ["task_categories:other", "task_ids:named-entity-recognition", "task_ids:part-of-speech", "annotations_creators:machine-generated", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|conll2003", "language:sk", "structure-prediction"], "is_gated": false}, "k-halid/ar": {"dataset_name": "k-halid/ar", "description": "The corpus is a part of the MultiUN corpus.It is a collection of translated documents from the United Nations.The corpus is download from the following website : [open parallel corpus](http://opus.datasetsl.eu/) \\", "downloads": 12, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\"\\\\u0631\\\\u0633\\\\u0627\\\\u0644\\\\u0629 \\\\u0645\\\\u0624\\\\u0631...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "The corpus is a part of the MultiUN corpus.It is a collection of translated documents from the United Nations.The corpus is download from the following website : [open parallel corpus](http://opus.datasetsl.eu/) ", "dataset_name": "k-halid/ar"}}, "tags": [], "is_gated": false}, "lavis-nlp/german_legal_sentences": {"dataset_name": "lavis-nlp/german_legal_sentences", "description": "German Legal Sentences (GLS) is an automatically generated training dataset for semantic sentence \nmatching in the domain in german legal documents. It follows the concept of weak supervision, where \nimperfect labels are generated using multiple heuristics. For this purpose we use a combination of \nlegal citation matching and BM25 similarity. The contained sentences and their citations are parsed \nfrom real judicial decisions provided by [Open Legal Data](http://openlegaldata.io/)", "downloads": 29, "configs": {"sentences": {"config_name": "sentences", "sample_row": "{\"sent_id\": \"1710015\", \"doc_id\": \"201218\", \"text\": \"\\\"Die nach [REF] zul\\\\u00e4ssige Beschwerde ist auch...\", \"references.ref_id\": \"[6565]\", \"references.name\": \"[\\\"\\\\u00a7 127 Abs. 2 Satz 2 ZPO\\\"]\", \"references.type\": \"[1]\"}", "columns": ["sent_id", "doc_id", "text", "references_ref_id", "references_name", "references_type"], "columns_mapping": {"sent_id": "sent_id", "doc_id": "doc_id", "text": "text", "references.ref_id": "references_ref_id", "references.name": "references_name", "references.type": "references_type"}, "dataset_description": "German Legal Sentences (GLS) is an automatically generated training dataset for semantic sentence \nmatching in the domain in german legal documents. It follows the concept of weak supervision, where \nimperfect labels are generated using multiple heuristics. For this purpose we use a combination of \nlegal citation matching and BM25 similarity. The contained sentences and their citations are parsed \nfrom real judicial decisions provided by [Open Legal Data](http://openlegaldata.io/)\n", "dataset_name": "lavis-nlp/german_legal_sentences"}, "pairs": {"config_name": "pairs", "sample_row": "{\"query.sent_id\": \"0\", \"query.doc_id\": \"0\", \"query.text\": \"\\\"Gem\\\\u00e4\\\\u00df [REF] kann der Vertrieb eines nac...\", \"query.ref_ids\": \"[8]\", \"related.sent_id\": \"167082\", \"related.doc_id\": \"14964\", \"related.text\": \"\\\"Die Revision wendet sich mit Erfolg gegen die Ann...\", \"related.ref_ids\": \"[141578, 8]\"}", "columns": ["query_sent_id", "query_doc_id", "query_text", "query_ref_ids", "related_sent_id", "related_doc_id", "related_text", "related_ref_ids"], "columns_mapping": {"query.sent_id": "query_sent_id", "query.doc_id": "query_doc_id", "query.text": "query_text", "query.ref_ids": "query_ref_ids", "related.sent_id": "related_sent_id", "related.doc_id": "related_doc_id", "related.text": "related_text", "related.ref_ids": "related_ref_ids"}, "dataset_description": "German Legal Sentences (GLS) is an automatically generated training dataset for semantic sentence \nmatching in the domain in german legal documents. It follows the concept of weak supervision, where \nimperfect labels are generated using multiple heuristics. For this purpose we use a combination of \nlegal citation matching and BM25 similarity. The contained sentences and their citations are parsed \nfrom real judicial decisions provided by [Open Legal Data](http://openlegaldata.io/)\n", "dataset_name": "lavis-nlp/german_legal_sentences"}, "pairs+es": {"config_name": "pairs+es", "sample_row": "{\"query.sent_id\": \"0\", \"query.doc_id\": \"0\", \"query.text\": \"\\\"Gem\\\\u00e4\\\\u00df [REF] kann der Vertrieb eines nac...\", \"query.ref_ids\": \"[8]\", \"related.sent_id\": \"167082\", \"related.doc_id\": \"14964\", \"related.text\": \"\\\"Die Revision wendet sich mit Erfolg gegen die Ann...\", \"related.ref_ids\": \"[141578, 8]\", \"es_neighbors.text\": \"[\\\"Besondere Umst\\\\u00e4nde , die das Verhalten des ...\", \"es_neighbors.sent_id\": \"[1349763, 1407242, 996686, 751840, 304375]\", \"es_neighbors.doc_id\": \"[149748, 156752, 107019, 78568, 28812]\", \"es_neighbors.ref_ids\": \"[[399], [691], [15], [141578], [7115, 62763]]\"}", "columns": ["query_sent_id", "query_doc_id", "query_text", "query_ref_ids", "related_sent_id", "related_doc_id", "related_text", "related_ref_ids", "es_neighbors_text", "es_neighbors_sent_id", "es_neighbors_doc_id", "es_neighbors_ref_ids"], "columns_mapping": {"query.sent_id": "query_sent_id", "query.doc_id": "query_doc_id", "query.text": "query_text", "query.ref_ids": "query_ref_ids", "related.sent_id": "related_sent_id", "related.doc_id": "related_doc_id", "related.text": "related_text", "related.ref_ids": "related_ref_ids", "es_neighbors.text": "es_neighbors_text", "es_neighbors.sent_id": "es_neighbors_sent_id", "es_neighbors.doc_id": "es_neighbors_doc_id", "es_neighbors.ref_ids": "es_neighbors_ref_ids"}, "dataset_description": "German Legal Sentences (GLS) is an automatically generated training dataset for semantic sentence \nmatching in the domain in german legal documents. It follows the concept of weak supervision, where \nimperfect labels are generated using multiple heuristics. For this purpose we use a combination of \nlegal citation matching and BM25 similarity. The contained sentences and their citations are parsed \nfrom real judicial decisions provided by [Open Legal Data](http://openlegaldata.io/)\n", "dataset_name": "lavis-nlp/german_legal_sentences"}}, "tags": ["task_categories:text-retrieval", "task_ids:semantic-similarity-scoring", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:de"], "is_gated": false}, "lhoestq/test": {"dataset_name": "lhoestq/test", "description": "This is a test dataset.", "downloads": 355, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"- Hello there !\\\"\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "This is a test dataset.\n", "dataset_name": "lhoestq/test"}}, "tags": ["annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "liweili/c4_200m": {"dataset_name": "liweili/c4_200m", "description": "\\\r\nGEC Dataset Generated from C4", "downloads": 29, "configs": {"default": {"config_name": "default", "sample_row": "{\"input\": \"\\\"Bitcoin is for $7,094 this morning, which CoinDes...\", \"output\": \"\\\"Bitcoin goes for $7,094 this morning, according t...\"}", "columns": ["input", "output"], "columns_mapping": {"input": "input", "output": "output"}, "dataset_description": "GEC Dataset Generated from C4\n", "dataset_name": "liweili/c4_200m"}}, "tags": ["task_categories:text-generation", "source_datasets:allenai/c4", "language:en", "grammatical-error-correction"], "is_gated": false}, "metaeval/blimp_classification": {"dataset_name": "metaeval/blimp_classification", "description": "Acceptable/non acceptable sentences (recasted as a classification task)", "downloads": 45, "configs": {"semantics": {"config_name": "semantics", "sample_row": "{\"sentence\": \"\\\"There was each vase aggravating Carol.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "Acceptable/non acceptable sentences (recasted as a classification task)\n", "dataset_name": "metaeval/blimp_classification"}, "syntax": {"config_name": "syntax", "sample_row": "{\"sentence\": \"\\\"Many senators were collaborated by Lucille.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "Acceptable/non acceptable sentences (recasted as a classification task)\n", "dataset_name": "metaeval/blimp_classification"}, "morphology": {"config_name": "morphology", "sample_row": "{\"sentence\": \"\\\"Some girl hired that pedestrians.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "Acceptable/non acceptable sentences (recasted as a classification task)\n", "dataset_name": "metaeval/blimp_classification"}, "syntax+semantics": {"config_name": "syntax+semantics", "sample_row": "{\"sentence\": \"\\\"Debra is imagining herself looking like these pho...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "Acceptable/non acceptable sentences (recasted as a classification task)\n", "dataset_name": "metaeval/blimp_classification"}, "syntax_semantics": {"config_name": "syntax_semantics", "sample_row": "{\"sentence\": \"\\\"Ronald preferred it to be interesting that Rhonda...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "Acceptable/non acceptable sentences (recasted as a classification task)\n", "dataset_name": "metaeval/blimp_classification"}}, "tags": ["task_categories:text-classification", "task_ids:acceptability-classification", "language:en", "cola"], "is_gated": false}, "tasksource/crowdflower": {"dataset_name": "tasksource/crowdflower", "description": "Collection of crowdflower classification datasets", "downloads": 78, "configs": {"sentiment_nuclear_power": {"config_name": "sentiment_nuclear_power", "sample_row": "{\"text\": \"\\\":Hello Japan is a nuclear power plant crisis. {li...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "tweet_global_warming": {"config_name": "tweet_global_warming", "sample_row": "{\"text\": \"\\\"Global warming report urges governments to act|BR...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "airline-sentiment": {"config_name": "airline-sentiment", "sample_row": "{\"text\": \"\\\"@VirginAmerica What @dhepburn said.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "corporate-messaging": {"config_name": "corporate-messaging", "sample_row": "{\"text\": \"\\\"Barclays CEO stresses the importance of regulator...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "economic-news": {"config_name": "economic-news", "sample_row": "{\"text\": \"\\\"\\\\\\\"NEW YORK -- Yields on most certificates of depo...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "political-media-audience": {"config_name": "political-media-audience", "sample_row": "{\"text\": \"\\\"RT @nowthisnews: Rep. Trey Radel (R- #FL) slams #...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "political-media-bias": {"config_name": "political-media-bias", "sample_row": "{\"text\": \"\\\"RT @nowthisnews: Rep. Trey Radel (R- #FL) slams #...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "political-media-message": {"config_name": "political-media-message", "sample_row": "{\"text\": \"\\\"RT @nowthisnews: Rep. Trey Radel (R- #FL) slams #...\", \"label\": \"2\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}, "text_emotion": {"config_name": "text_emotion", "sample_row": "{\"text\": \"\\\"@tiffanylue i know i was listenin to bad habit e...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "\nCollection of crowdflower classification datasets\n", "dataset_name": "tasksource/crowdflower"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "task_ids:fact-checking", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en"], "is_gated": false}, "metaeval/ethics": {"dataset_name": "metaeval/ethics", "description": "Probing for ethics understanding", "downloads": 1730, "configs": {"commonsense": {"config_name": "commonsense", "sample_row": "{\"text\": \"\\\"I went to the principal's office to change my rec...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "Probing for ethics understanding\n", "dataset_name": "metaeval/ethics"}, "deontology": {"config_name": "deontology", "sample_row": "{\"text\": \"\\\"Aren't you supposed to be preparing for the meeti...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "Probing for ethics understanding\n", "dataset_name": "metaeval/ethics"}, "justice": {"config_name": "justice", "sample_row": "{\"text\": \"\\\"I deserve to be paid by my boss because I keep he...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "Probing for ethics understanding\n", "dataset_name": "metaeval/ethics"}, "virtue": {"config_name": "virtue", "sample_row": "{\"sentence1\": \"\\\"Jill wore a new style of distinctive clothes each...\", \"sentence2\": \"\\\" discredit\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence1", "sentence2", "label", "idx"], "columns_mapping": {"sentence1": "sentence1", "sentence2": "sentence2", "label": "label", "idx": "idx"}, "dataset_description": "Probing for ethics understanding\n", "dataset_name": "metaeval/ethics"}}, "tags": ["task_categories:text-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en"], "is_gated": false}, "metaeval/linguisticprobing": {"dataset_name": "metaeval/linguisticprobing", "description": "10 probing tasks designed to capture simple linguistic features of sentences,", "downloads": 131, "configs": {"subj_number": {"config_name": "subj_number", "sample_row": "{\"sentence\": \"\\\"Coming from a xenophobic race that possesses the ...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "word_content": {"config_name": "word_content", "sample_row": "{\"sentence\": \"\\\"It just hadn 't seemed important, and he didn 't ...\", \"label\": \"552\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "obj_number": {"config_name": "obj_number", "sample_row": "{\"sentence\": \"\\\"Money would replace the drugs in the bags, once t...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "past_present": {"config_name": "past_present", "sample_row": "{\"sentence\": \"\\\"She shone her light around the space, following t...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "sentence_length": {"config_name": "sentence_length", "sample_row": "{\"sentence\": \"\\\"But it was not here.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "top_constituents": {"config_name": "top_constituents", "sample_row": "{\"sentence\": \"\\\"I wanted to start asking questions now, but force...\", \"label\": \"7\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "tree_depth": {"config_name": "tree_depth", "sample_row": "{\"sentence\": \"\\\"Who knew who would be there?\\\"\", \"label\": \"5\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "coordination_inversion": {"config_name": "coordination_inversion", "sample_row": "{\"sentence\": \"\\\"She was a regular at the Friday charity sessions,...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "odd_man_out": {"config_name": "odd_man_out", "sample_row": "{\"sentence\": \"\\\"Gideon brought his phone to his ear and resonated...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}, "bigram_shift": {"config_name": "bigram_shift", "sample_row": "{\"sentence\": \"\\\"A week she'd been with the man, just a week, and ...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["sentence", "label", "idx"], "columns_mapping": {"sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "10 probing tasks designed to capture simple linguistic features of sentences,\n", "dataset_name": "metaeval/linguisticprobing"}}, "tags": ["task_categories:text-classification", "annotations_creators:machine-generated", "multilinguality:monolingual", "language:en"], "is_gated": false}, "metaeval/recast": {"dataset_name": "metaeval/recast", "description": "A diverse collection of tasks recasted as natural language inference tasks.", "downloads": 83, "configs": {"recast_kg_relations": {"config_name": "recast_kg_relations", "sample_row": "{\"context\": \"\\\"Diplomats say Assad 's absence from the meeting a...\", \"hypothesis\": \"\\\"Assad was buried in Syria .\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_puns": {"config_name": "recast_puns", "sample_row": "{\"context\": \"\\\"Michaela heard that the agreeable tennis umpire w...\", \"hypothesis\": \"\\\"Michaela heard a pun\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_factuality": {"config_name": "recast_factuality", "sample_row": "{\"context\": \"\\\"We had a larger black population in the 70s than ...\", \"hypothesis\": \"\\\"The having happened\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_verbnet": {"config_name": "recast_verbnet", "sample_row": "{\"context\": \"\\\"David constructed a house .\\\"\", \"hypothesis\": \"\\\"David caused the constructing .\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_verbcorner": {"config_name": "recast_verbcorner", "sample_row": "{\"context\": \"\\\"Samantha enjoyed the blinch.\\\"\", \"hypothesis\": \"\\\"Something good happened .\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_ner": {"config_name": "recast_ner", "sample_row": "{\"context\": \"\\\"Mexican President Felipe Calderon has sought more...\", \"hypothesis\": \"\\\"Mexican is a day of the week\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_sentiment": {"config_name": "recast_sentiment", "sample_row": "{\"context\": \"\\\"When asked about the product, Eniyah said, 'I had...\", \"hypothesis\": \"\\\"Eniyah liked the product . \\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}, "recast_megaveridicality": {"config_name": "recast_megaveridicality", "sample_row": "{\"context\": \"\\\"someone confirmed that a particular thing happene...\", \"hypothesis\": \"\\\"that thing happened .\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["context", "hypothesis", "label", "idx"], "columns_mapping": {"context": "context", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "A diverse collection of tasks recasted as natural language inference tasks.\n", "dataset_name": "metaeval/recast"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "nli", "natural-language-inference"], "is_gated": false}, "midas/inspec": {"dataset_name": "midas/inspec", "description": "Benchmark dataset for automatic identification of keyphrases from text published with the work - Improved automatic keyword extraction given more linguistic knowledge. Anette Hulth. In Proceedings of EMNLP 2003. p. 216-223.", "downloads": 459, "configs": {"extraction": {"config_name": "extraction", "sample_row": "{\"id\": \"1001\", \"document\": \"[\\\"A\\\", \\\"conflict\\\", \\\"between\\\", \\\"language\\\", \\\"and\\\", \\\"a...\", \"doc_bio_tags\": \"[\\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\",...\"}", "columns": ["id", "document", "doc_bio_tags"], "columns_mapping": {"id": "id", "document": "document", "doc_bio_tags": "doc_bio_tags"}, "dataset_description": "Benchmark dataset for automatic identification of keyphrases from text published with the work - Improved automatic keyword extraction given more linguistic knowledge. Anette Hulth. In Proceedings of EMNLP 2003. p. 216-223.\n", "dataset_name": "midas/inspec"}, "generation": {"config_name": "generation", "sample_row": "{\"id\": \"1001\", \"document\": \"[\\\"A\\\", \\\"conflict\\\", \\\"between\\\", \\\"language\\\", \\\"and\\\", \\\"a...\", \"extractive_keyphrases\": \"[\\\"philosophy of mind\\\", \\\"content atomism\\\", \\\"ibs\\\", \\\"...\", \"abstractive_keyphrases\": \"[\\\"information-based semantics\\\"]\"}", "columns": ["id", "document", "extractive_keyphrases", "abstractive_keyphrases"], "columns_mapping": {"id": "id", "document": "document", "extractive_keyphrases": "extractive_keyphrases", "abstractive_keyphrases": "abstractive_keyphrases"}, "dataset_description": "Benchmark dataset for automatic identification of keyphrases from text published with the work - Improved automatic keyword extraction given more linguistic knowledge. Anette Hulth. In Proceedings of EMNLP 2003. p. 216-223.\n", "dataset_name": "midas/inspec"}, "raw": {"config_name": "raw", "sample_row": "{\"id\": \"1001\", \"document\": \"[\\\"A\\\", \\\"conflict\\\", \\\"between\\\", \\\"language\\\", \\\"and\\\", \\\"a...\", \"doc_bio_tags\": \"[\\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\",...\", \"extractive_keyphrases\": \"[\\\"philosophy of mind\\\", \\\"content atomism\\\", \\\"ibs\\\", \\\"...\", \"abstractive_keyphrases\": \"[\\\"information-based semantics\\\"]\", \"other_metadata.text\": \"[]\", \"other_metadata.bio_tags\": \"[]\"}", "columns": ["id", "document", "doc_bio_tags", "extractive_keyphrases", "abstractive_keyphrases", "other_metadata_text", "other_metadata_bio_tags"], "columns_mapping": {"id": "id", "document": "document", "doc_bio_tags": "doc_bio_tags", "extractive_keyphrases": "extractive_keyphrases", "abstractive_keyphrases": "abstractive_keyphrases", "other_metadata.text": "other_metadata_text", "other_metadata.bio_tags": "other_metadata_bio_tags"}, "dataset_description": "Benchmark dataset for automatic identification of keyphrases from text published with the work - Improved automatic keyword extraction given more linguistic knowledge. Anette Hulth. In Proceedings of EMNLP 2003. p. 216-223.\n", "dataset_name": "midas/inspec"}}, "tags": [], "is_gated": false}, "midas/ldkp10k": {"dataset_name": "midas/ldkp10k", "description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.", "downloads": 10, "configs": {"small": {"config_name": "small", "sample_row": "{\"id\": \"\\\"18980016\\\"\", \"sections\": \"[\\\"introduction\\\", \\\"application to diffraction grati...\", \"sec_text\": \"[[\\\"New\\\", \\\"and\\\", \\\"interesting\\\", \\\"theoretical\\\", \\\"cha...\", \"extractive_keyphrases\": \"[]\", \"abstractive_keyphrases\": \"[\\\"quantum mechanics\\\", \\\"quantum physics\\\"]\", \"sec_bio_tags\": \"[[\\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\"...\"}", "columns": ["id", "sections", "sec_text", "extractive_keyphrases", "abstractive_keyphrases", "sec_bio_tags"], "columns_mapping": {"id": "id", "sections": "sections", "sec_text": "sec_text", "extractive_keyphrases": "extractive_keyphrases", "abstractive_keyphrases": "abstractive_keyphrases", "sec_bio_tags": "sec_bio_tags"}, "dataset_description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.\n", "dataset_name": "midas/ldkp10k"}, "medium": {"config_name": "medium", "sample_row": "{\"id\": \"\\\"18988100\\\"\", \"sections\": \"[\\\"introduction\\\", \\\"diffusion on ge(111)-c(2\\\\u00d78)...\", \"sec_text\": \"[[\\\"The\\\", \\\"successful\\\", \\\"production\\\", \\\"of\\\", \\\"electr...\", \"extractive_keyphrases\": \"[\\\"diffusion\\\", \\\"saddle point\\\", \\\"thin film\\\"]\", \"abstractive_keyphrases\": \"[\\\"materials science\\\", \\\"growth mechanism\\\", \\\"germani...\", \"sec_bio_tags\": \"[[\\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\"...\"}", "columns": ["id", "sections", "sec_text", "extractive_keyphrases", "abstractive_keyphrases", "sec_bio_tags"], "columns_mapping": {"id": "id", "sections": "sections", "sec_text": "sec_text", "extractive_keyphrases": "extractive_keyphrases", "abstractive_keyphrases": "abstractive_keyphrases", "sec_bio_tags": "sec_bio_tags"}, "dataset_description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.\n", "dataset_name": "midas/ldkp10k"}, "large": {"config_name": "large", "sample_row": "{\"id\": \"\\\"18980258\\\"\", \"sections\": \"[\\\"visual sleep staging is still the most widely us...\", \"sec_text\": \"[[\\\"TO\\\", \\\"SUBDIVIDE\\\", \\\"SLEEP\\\", \\\"RECORDINGS\\\", \\\"INTO\\\"...\", \"extractive_keyphrases\": \"[\\\"classification\\\"]\", \"abstractive_keyphrases\": \"[\\\"sleep wake cycle\\\", \\\"electrodiagnosis\\\", \\\"electrop...\", \"sec_bio_tags\": \"[[\\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\"...\"}", "columns": ["id", "sections", "sec_text", "extractive_keyphrases", "abstractive_keyphrases", "sec_bio_tags"], "columns_mapping": {"id": "id", "sections": "sections", "sec_text": "sec_text", "extractive_keyphrases": "extractive_keyphrases", "abstractive_keyphrases": "abstractive_keyphrases", "sec_bio_tags": "sec_bio_tags"}, "dataset_description": "This new dataset is designed to solve kp NLP task and is crafted with a lot of care.\n", "dataset_name": "midas/ldkp10k"}}, "tags": [], "is_gated": false}, "mideind/icelandic-error-corpus-IceEC": {"dataset_name": "mideind/icelandic-error-corpus-IceEC", "description": "The Icelandic Error Corpus (IceEC) is a collection of texts in modern Icelandic annotated for mistakes related to spelling, grammar, and other issues. The texts are organized by genre. The current version includes sentences from student essays, online news texts and Wikipedia articles.\nSentences within texts in the student essays had to be shuffled due to the license which they were originally published under, but neither the online news texts nor the Wikipedia articles needed to be shuffled.", "downloads": 32, "configs": {"fine-grained": {"config_name": "fine-grained", "sample_row": "{\"idx\": \"\\\"0\\\"\", \"sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\", \"errors\": \"[[], [], [], [], [], [], [], [], [], [], [], [], [...\", \"has_error\": \"false\", \"corrected_sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\"}", "columns": ["idx", "sentence", "errors", "has_error", "corrected_sentence"], "columns_mapping": {"idx": "idx", "sentence": "sentence", "errors": "errors", "has_error": "has_error", "corrected_sentence": "corrected_sentence"}, "dataset_description": "The Icelandic Error Corpus (IceEC) is a collection of texts in modern Icelandic annotated for mistakes related to spelling, grammar, and other issues. The texts are organized by genre. The current version includes sentences from student essays, online news texts and Wikipedia articles.\nSentences within texts in the student essays had to be shuffled due to the license which they were originally published under, but neither the online news texts nor the Wikipedia articles needed to be shuffled.\n", "dataset_name": "mideind/icelandic-error-corpus-IceEC"}, "subcategory": {"config_name": "subcategory", "sample_row": "{\"idx\": \"\\\"0\\\"\", \"sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\", \"errors\": \"[[], [], [], [], [], [], [], [], [], [], [], [], [...\", \"has_error\": \"false\", \"corrected_sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\"}", "columns": ["idx", "sentence", "errors", "has_error", "corrected_sentence"], "columns_mapping": {"idx": "idx", "sentence": "sentence", "errors": "errors", "has_error": "has_error", "corrected_sentence": "corrected_sentence"}, "dataset_description": "The Icelandic Error Corpus (IceEC) is a collection of texts in modern Icelandic annotated for mistakes related to spelling, grammar, and other issues. The texts are organized by genre. The current version includes sentences from student essays, online news texts and Wikipedia articles.\nSentences within texts in the student essays had to be shuffled due to the license which they were originally published under, but neither the online news texts nor the Wikipedia articles needed to be shuffled.\n", "dataset_name": "mideind/icelandic-error-corpus-IceEC"}, "category": {"config_name": "category", "sample_row": "{\"idx\": \"\\\"0\\\"\", \"sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\", \"errors\": \"[[], [], [], [], [], [], [], [], [], [], [], [], [...\", \"has_error\": \"false\", \"corrected_sentence\": \"[\\\"Jack\\\", \\\"Live\\\", \\\"kv\\\\u00f6ldin\\\", \\\"halda\\\", \\\"g\\\\u00f6...\"}", "columns": ["idx", "sentence", "errors", "has_error", "corrected_sentence"], "columns_mapping": {"idx": "idx", "sentence": "sentence", "errors": "errors", "has_error": "has_error", "corrected_sentence": "corrected_sentence"}, "dataset_description": "The Icelandic Error Corpus (IceEC) is a collection of texts in modern Icelandic annotated for mistakes related to spelling, grammar, and other issues. The texts are organized by genre. The current version includes sentences from student essays, online news texts and Wikipedia articles.\nSentences within texts in the student essays had to be shuffled due to the license which they were originally published under, but neither the online news texts nor the Wikipedia articles needed to be shuffled.\n", "dataset_name": "mideind/icelandic-error-corpus-IceEC"}}, "tags": ["annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:is"], "is_gated": false}, "ml6team/cnn_dailymail_nl": {"dataset_name": "ml6team/cnn_dailymail_nl", "description": " This dataset is the CNN/Dailymail dataset translated to Dutch.\n This is the original dataset:\n ```\n load_dataset(\"cnn_dailymail\", '3.0.0')\n ```\n And this is the HuggingFace translation pipeline:\n ```\n pipeline(\n task='translation_en_to_nl',\n model='Helsinki-NLP/opus-mt-en-nl',\n tokenizer='Helsinki-NLP/opus-mt-en-nl')\n ```", "downloads": 50, "configs": {"default": {"config_name": "default", "sample_row": "{\"article\": \"\\\"(CNN) -- de bewering van de Amerikaanse minister ...\", \"highlights\": \"\\\"Anti-terrorisme beleid leeft op de rand van het i...\", \"id\": \"\\\"0d8f8bad4680a1ab57197f60923e8cf71c748d6f\\\"\"}", "columns": ["article", "highlights", "id"], "columns_mapping": {"article": "article", "highlights": "highlights", "id": "id"}, "dataset_description": " This dataset is the CNN/Dailymail dataset translated to Dutch.\n This is the original dataset:\n ```\n load_dataset(\"cnn_dailymail\", '3.0.0')\n ```\n And this is the HuggingFace translation pipeline:\n ```\n pipeline(\n task='translation_en_to_nl',\n model='Helsinki-NLP/opus-mt-en-nl',\n tokenizer='Helsinki-NLP/opus-mt-en-nl')\n ```\n", "dataset_name": "ml6team/cnn_dailymail_nl"}}, "tags": ["annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:https://github.com/huggingface/datasets/tree/master/datasets/cnn_dailymail", "language:nl"], "is_gated": false}, "indonesian-nlp/mc4-id": {"dataset_name": "indonesian-nlp/mc4-id", "description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.", "downloads": 72, "configs": {"tiny": {"config_name": "tiny", "sample_row": "{\"text\": \"\\\"Rentetan Program Kegiatan Prudential yang Harus A...\", \"timestamp\": \"\\\"2020-01-23T09:19:49Z\\\"\", \"url\": \"\\\"https://www.prudential.co.id/id/Informasi-untuk-A...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "indonesian-nlp/mc4-id"}, "small": {"config_name": "small", "sample_row": "{\"text\": \"\\\"Rentetan Program Kegiatan Prudential yang Harus A...\", \"timestamp\": \"\\\"2020-01-23T09:19:49Z\\\"\", \"url\": \"\\\"https://www.prudential.co.id/id/Informasi-untuk-A...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "indonesian-nlp/mc4-id"}, "medium": {"config_name": "medium", "sample_row": "{\"text\": \"\\\"Rentetan Program Kegiatan Prudential yang Harus A...\", \"timestamp\": \"\\\"2020-01-23T09:19:49Z\\\"\", \"url\": \"\\\"https://www.prudential.co.id/id/Informasi-untuk-A...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "indonesian-nlp/mc4-id"}, "large": {"config_name": "large", "sample_row": "{\"text\": \"\\\"Rentetan Program Kegiatan Prudential yang Harus A...\", \"timestamp\": \"\\\"2020-01-23T09:19:49Z\\\"\", \"url\": \"\\\"https://www.prudential.co.id/id/Informasi-untuk-A...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "indonesian-nlp/mc4-id"}, "full": {"config_name": "full", "sample_row": "{\"text\": \"\\\"Rentetan Program Kegiatan Prudential yang Harus A...\", \"timestamp\": \"\\\"2020-01-23T09:19:49Z\\\"\", \"url\": \"\\\"https://www.prudential.co.id/id/Informasi-untuk-A...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Italian portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "indonesian-nlp/mc4-id"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:extended", "language:id"], "is_gated": false}, "mvarma/medwiki": {"dataset_name": "mvarma/medwiki", "description": "MedWiki is a large-scale sentence dataset collected from Wikipedia with medical entity (UMLS) annotations. This dataset is intended for pretraining.", "downloads": 37, "configs": {"medwiki_full": {"config_name": "medwiki_full", "sample_row": "{\"mentions\": \"[\\\"dahl\\\", \\\"dahl\\\", \\\"dahl\\\"]\", \"entities\": \"[\\\"C0600533\\\", \\\"C0600533\\\", \\\"C0600533\\\"]\", \"entity_titles\": \"[\\\"Rats, Inbred Dahl\\\", \\\"Rats, Inbred Dahl\\\", \\\"Rats, ...\", \"types\": \"[[\\\"Mammal\\\", \\\"writer\\\", \\\"poet\\\", \\\"screenwriter\\\", \\\"aut...\", \"spans\": \"[[10, 11], [12, 14], [16, 18]]\", \"sentence\": \"\\\"Receiving the 1983 World Fantasy Award for Life A...\", \"sent_idx_unq\": \"44000000\"}", "columns": ["mentions", "entities", "entity_titles", "types", "spans", "sentence", "sent_idx_unq"], "columns_mapping": {"mentions": "mentions", "entities": "entities", "entity_titles": "entity_titles", "types": "types", "spans": "spans", "sentence": "sentence", "sent_idx_unq": "sent_idx_unq"}, "dataset_description": "MedWiki is a large-scale sentence dataset collected from Wikipedia with medical entity (UMLS) annotations. This dataset is intended for pretraining.\n", "dataset_name": "mvarma/medwiki"}, "medwiki_hq": {"config_name": "medwiki_hq", "sample_row": "{\"mentions\": \"[\\\"czechoslovakia\\\"]\", \"entities\": \"[\\\"C0010872\\\"]\", \"entity_titles\": \"[\\\"Czechoslovakia\\\"]\", \"types\": \"[[\\\"Geographic Area\\\", \\\"historical country\\\", \\\"sovere...\", \"spans\": \"[[10, 11]]\", \"sentence\": \"\\\"The Czechoslovakia men 's national under-21 volle...\", \"sent_idx_unq\": \"44000003\"}", "columns": ["mentions", "entities", "entity_titles", "types", "spans", "sentence", "sent_idx_unq"], "columns_mapping": {"mentions": "mentions", "entities": "entities", "entity_titles": "entity_titles", "types": "types", "spans": "spans", "sentence": "sentence", "sent_idx_unq": "sent_idx_unq"}, "dataset_description": "MedWiki is a large-scale sentence dataset collected from Wikipedia with medical entity (UMLS) annotations. This dataset is intended for pretraining.\n", "dataset_name": "mvarma/medwiki"}}, "tags": ["task_categories:text-retrieval", "task_ids:entity-linking-retrieval", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:extended|wikipedia"], "is_gated": false}, "ought/raft": {"dataset_name": "ought/raft", "description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)", "downloads": 15611, "configs": {"ade_corpus_v2": {"config_name": "ade_corpus_v2", "sample_row": "{\"Sentence\": \"\\\"No regional side effects were noted.\\\"\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Sentence", "ID", "Label"], "columns_mapping": {"Sentence": "Sentence", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "banking_77": {"config_name": "banking_77", "sample_row": "{\"Query\": \"\\\"Is it possible for me to change my PIN number?\\\"...\", \"ID\": \"0\", \"Label\": \"23\"}", "columns": ["Query", "ID", "Label"], "columns_mapping": {"Query": "Query", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "terms_of_service": {"config_name": "terms_of_service", "sample_row": "{\"Sentence\": \"\\\"Crowdtangle may change these terms of service, as...\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Sentence", "ID", "Label"], "columns_mapping": {"Sentence": "Sentence", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "tai_safety_research": {"config_name": "tai_safety_research", "sample_row": "{\"Title\": \"\\\"Malign generalization without internal search\\\"\", \"Abstract Note\": \"\\\"In my last post, I challenged the idea that inner...\", \"Url\": \"\\\"https://www.alignmentforum.org/posts/ynt9TD6PrYw6...\", \"Publication Year\": \"\\\"2020\\\"\", \"Item Type\": \"\\\"blogPost\\\"\", \"Author\": \"\\\"Barnett, Matthew\\\"\", \"Publication Title\": \"\\\"AI Alignment Forum\\\"\", \"ID\": \"0\", \"Label\": \"1\"}", "columns": ["Title", "Abstract Note", "Url", "Publication Year", "Item Type", "Author", "Publication Title", "ID", "Label"], "columns_mapping": {"Title": "Title", "Abstract Note": "Abstract Note", "Url": "Url", "Publication Year": "Publication Year", "Item Type": "Item Type", "Author": "Author", "Publication Title": "Publication Title", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "neurips_impact_statement_risks": {"config_name": "neurips_impact_statement_risks", "sample_row": "{\"Paper title\": \"\\\"Auto-Panoptic: Cooperative Multi-Component Archit...\", \"Paper link\": \"\\\"https://proceedings.neurips.cc/paper/2020/file/ec...\", \"Impact statement\": \"\\\"This work makes the first attempt to search for a...\", \"ID\": \"0\", \"Label\": \"1\"}", "columns": ["Paper title", "Paper link", "Impact statement", "ID", "Label"], "columns_mapping": {"Paper title": "Paper title", "Paper link": "Paper link", "Impact statement": "Impact statement", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "overruling": {"config_name": "overruling", "sample_row": "{\"Sentence\": \"\\\"in light of both our holding today and previous r...\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Sentence", "ID", "Label"], "columns_mapping": {"Sentence": "Sentence", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "systematic_review_inclusion": {"config_name": "systematic_review_inclusion", "sample_row": "{\"Title\": \"\\\"Prototyping and transforming facial textures for ...\", \"Abstract\": \"\\\"Wavelet based methods for prototyping facial text...\", \"Authors\": \"\\\"Tiddeman, B.; Burt, M.; Perrett, D.\\\"\", \"Journal\": \"\\\"IEEE Comput Graphics Appl\\\"\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Title", "Abstract", "Authors", "Journal", "ID", "Label"], "columns_mapping": {"Title": "Title", "Abstract": "Abstract", "Authors": "Authors", "Journal": "Journal", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "one_stop_english": {"config_name": "one_stop_english", "sample_row": "{\"Article\": \"\\\"For 85 years, it was just a grey blob on classroo...\", \"ID\": \"0\", \"Label\": \"3\"}", "columns": ["Article", "ID", "Label"], "columns_mapping": {"Article": "Article", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "tweet_eval_hate": {"config_name": "tweet_eval_hate", "sample_row": "{\"Tweet\": \"\\\"New to Twitter-- any men on here know what the pr...\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Tweet", "ID", "Label"], "columns_mapping": {"Tweet": "Tweet", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "twitter_complaints": {"config_name": "twitter_complaints", "sample_row": "{\"Tweet text\": \"\\\"@HMRCcustomers No this is my first job\\\"\", \"ID\": \"0\", \"Label\": \"2\"}", "columns": ["Tweet text", "ID", "Label"], "columns_mapping": {"Tweet text": "Tweet text", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}, "semiconductor_org_types": {"config_name": "semiconductor_org_types", "sample_row": "{\"Paper title\": \"\\\"3Gb/s AC-coupled chip-to-chip communication using...\", \"Organization name\": \"\\\"North Carolina State Univ.,Raleigh,NC,USA\\\"\", \"ID\": \"0\", \"Label\": \"3\"}", "columns": ["Paper title", "Organization name", "ID", "Label"], "columns_mapping": {"Paper title": "Paper title", "Organization name": "Organization name", "ID": "ID", "Label": "Label"}, "dataset_description": "Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? \n\n[RAFT](https://raft.elicit.org) is a few-shot classification benchmark that tests language models:\n\n- across multiple domains (lit review, tweets, customer interaction, etc.)\n- on economically valuable classification tasks (someone inherently cares about the task)\n- in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)\n", "dataset_name": "ought/raft"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "source_datasets:extended|ade_corpus_v2", "source_datasets:extended|banking77", "language:en"], "is_gated": false}, "pasinit/xlwic": {"dataset_name": "pasinit/xlwic", "description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)", "downloads": 336, "configs": {"xlwic_en_bg": {"config_name": "xlwic_en_bg", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_zh": {"config_name": "xlwic_en_zh", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_hr": {"config_name": "xlwic_en_hr", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_da": {"config_name": "xlwic_en_da", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_nl": {"config_name": "xlwic_en_nl", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_et": {"config_name": "xlwic_en_et", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_fa": {"config_name": "xlwic_en_fa", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_ja": {"config_name": "xlwic_en_ja", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_ko": {"config_name": "xlwic_en_ko", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_it": {"config_name": "xlwic_en_it", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_fr": {"config_name": "xlwic_en_fr", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_en_de": {"config_name": "xlwic_en_de", "sample_row": "{\"id\": \"\\\"EN_0\\\"\", \"context_1\": \"\\\"Approach a task.\\\"\", \"context_2\": \"\\\"To approach the city.\\\"\", \"target_word\": \"\\\"approach\\\"\", \"pos\": \"\\\"V\\\"\", \"target_word_location_1.char_start\": \"0\", \"target_word_location_1.char_end\": \"8\", \"target_word_location_2.char_start\": \"3\", \"target_word_location_2.char_end\": \"11\", \"language\": \"\\\"EN\\\"\", \"label\": \"0\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_it_it": {"config_name": "xlwic_it_it", "sample_row": "{\"id\": \"\\\"IT_0\\\"\", \"context_1\": \"\\\"Improvvisamente il padre di famiglia: \\\\\\\"Tesoro, a...\", \"context_2\": \"\\\"C'\\\\u00e8 un progetto per Milano, a Segrate, per u...\", \"target_word\": \"\\\"centro commerciale\\\"\", \"pos\": \"\\\"N\\\"\", \"target_word_location_1.char_start\": \"58\", \"target_word_location_1.char_end\": \"76\", \"target_word_location_2.char_start\": \"46\", \"target_word_location_2.char_end\": \"64\", \"language\": \"\\\"IT\\\"\", \"label\": \"1\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_fr_fr": {"config_name": "xlwic_fr_fr", "sample_row": "{\"id\": \"\\\"FR_0\\\"\", \"context_1\": \"\\\"Comme l'indique le Shuowen Jiezi (dans son commen...\", \"context_2\": \"\\\"D\\\\u2019apr\\\\u00e8s le dictionnaire \\\\u00e9tymologiq...\", \"target_word\": \"\\\"ShuoWen\\\"\", \"pos\": \"\\\"N\\\"\", \"target_word_location_1.char_start\": \"19\", \"target_word_location_1.char_end\": \"26\", \"target_word_location_2.char_start\": \"37\", \"target_word_location_2.char_end\": \"44\", \"language\": \"\\\"FR\\\"\", \"label\": \"1\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}, "xlwic_de_de": {"config_name": "xlwic_de_de", "sample_row": "{\"id\": \"\\\"DE_0\\\"\", \"context_1\": \"\\\"Herr Starke wollte uns kein Interview geben.\\\"\", \"context_2\": \"\\\"Das kann ich dir aber sagen: Wenn die Frau Starke...\", \"target_word\": \"\\\"Starke\\\"\", \"pos\": \"\\\"N\\\"\", \"target_word_location_1.char_start\": \"5\", \"target_word_location_1.char_end\": \"11\", \"target_word_location_2.char_start\": \"43\", \"target_word_location_2.char_end\": \"49\", \"language\": \"\\\"DE\\\"\", \"label\": \"1\"}", "columns": ["id", "context_1", "context_2", "target_word", "pos", "target_word_location_1_char_start", "target_word_location_1_char_end", "target_word_location_2_char_start", "target_word_location_2_char_end", "language", "label"], "columns_mapping": {"id": "id", "context_1": "context_1", "context_2": "context_2", "target_word": "target_word", "pos": "pos", "target_word_location_1.char_start": "target_word_location_1_char_start", "target_word_location_1.char_end": "target_word_location_1_char_end", "target_word_location_2.char_start": "target_word_location_2_char_start", "target_word_location_2.char_end": "target_word_location_2_char_end", "language": "language", "label": "label"}, "dataset_description": "A system's task on any of the XL-WiC datasets is to identify the intended meaning of a word in a context of a given language. XL-WiC is framed as a binary classification task. Each instance in XL-WiC has a target word w, either a verb or a noun, for which two contexts are provided. Each of these contexts triggers a specific meaning of w. The task is to identify if the occurrences of w in the two contexts correspond to the same meaning or not.\n\nXL-WiC provides dev and test sets in the following 12 languages:\n\nBulgarian (BG)\nDanish (DA)\nGerman (DE)\nEstonian (ET)\nFarsi (FA)\nFrench (FR)\nCroatian (HR)\nItalian (IT)\nJapanese (JA)\nKorean (KO)\nDutch (NL)\nChinese (ZH)\nand training sets in the following 3 languages:\n\nGerman (DE)\nFrench (FR)\nItalian (IT)\n", "dataset_name": "pasinit/xlwic"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:bg", "language:zh", "language:hr", "language:da", "language:nl", "language:et", "language:fa", "language:ja", "language:ko", "language:it", "language:fr", "language:de"], "is_gated": false}, "peixian/equity_evaluation_corpus": {"dataset_name": "peixian/equity_evaluation_corpus", "description": "Automatic machine learning systems can inadvertently accentuate and perpetuate inappropriate human biases. Past work on examining inappropriate biases has largely focused on just individual systems and resources. Further, there is a lack of benchmark datasets for examining inappropriate biases in system predictions. Here, we present the Equity Evaluation Corpus (EEC), which consists of 8,640 English sentences carefully chosen to tease out biases towards certain races and genders. We used the dataset to examine 219 automatic sentiment analysis systems that took part in a recent shared task, SemEval-2018 Task 1 \u2018Affect in Tweets\u2019. We found that several of the systems showed statistically significant bias; that is, they consistently provide slightly higher sentiment intensity predictions for one race or one gender. We make the EEC freely available, and encourage its use to evaluate biases in sentiment and other NLP tasks.", "downloads": 35, "configs": {"first_domain": {"config_name": "first_domain", "sample_row": "{\"sentence\": \"\\\"Alonzo feels angry.\\\"\", \"template\": \"\\\" feels .\\\"\", \"person\": \"\\\"Alonzo\\\"\", \"gender\": \"\\\"male\\\"\", \"race\": \"\\\"African-American\\\"\", \"emotion\": \"\\\"anger\\\"\", \"emotion word\": \"\\\"angry\\\"\"}", "columns": ["sentence", "template", "person", "gender", "race", "emotion", "emotion word"], "columns_mapping": {"sentence": "sentence", "template": "template", "person": "person", "gender": "gender", "race": "race", "emotion": "emotion", "emotion word": "emotion word"}, "dataset_description": "Automatic machine learning systems can inadvertently accentuate and perpetuate inappropriate human biases. Past work on examining inappropriate biases has largely focused on just individual systems and resources. Further, there is a lack of benchmark datasets for examining inappropriate biases in system predictions. Here, we present the Equity Evaluation Corpus (EEC), which consists of 8,640 English sentences carefully chosen to tease out biases towards certain races and genders. We used the dataset to examine 219 automatic sentiment analysis systems that took part in a recent shared task, SemEval-2018 Task 1 \u2018Affect in Tweets\u2019. We found that several of the systems showed statistically significant bias; that is, they consistently provide slightly higher sentiment intensity predictions for one race or one gender. We make the EEC freely available, and encourage its use to evaluate biases in sentiment and other NLP tasks.\n", "dataset_name": "peixian/equity_evaluation_corpus"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "gender-classification"], "is_gated": false}, "persiannlp/parsinlu_entailment": {"dataset_name": "persiannlp/parsinlu_entailment", "description": "A Persian textual entailment task (deciding `sent1` entails `sent2`).", "downloads": 56, "configs": {"parsinlu-repo": {"config_name": "parsinlu-repo", "sample_row": "{\"sent1\": \"\\\"\\\\u0632\\\\u0646\\\\u0627\\\\u0646 \\\\u0628\\\\u0647 \\\\u0642\\\\u062...\", \"sent2\": \"\\\"\\\\u0645\\\\u0631\\\\u062f\\\\u0627\\\\u0646 \\\\u0628\\\\u062e\\\\u0634...\", \"category\": \"\\\"translation-train\\\"\", \"label\": \"\\\"c\\\"\"}", "columns": ["sent1", "sent2", "category", "label"], "columns_mapping": {"sent1": "sent1", "sent2": "sent2", "category": "category", "label": "label"}, "dataset_description": "A Persian textual entailment task (deciding `sent1` entails `sent2`). \n", "dataset_name": "persiannlp/parsinlu_entailment"}}, "tags": ["task_ids:natural-language-inference", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|translated|mnli", "language:fa"], "is_gated": false}, "persiannlp/parsinlu_query_paraphrasing": {"dataset_name": "persiannlp/parsinlu_query_paraphrasing", "description": "A Persian query paraphrasing task (paraphrase or not, given two questions). \nThe questions are partly mined using Google auto-complete, and partly translated from Quora paraphrasing dataset.", "downloads": 20, "configs": {"parsinlu-repo": {"config_name": "parsinlu-repo", "sample_row": "{\"q1\": \"\\\"\\\\u0686\\\\u06af\\\\u0648\\\\u0646\\\\u0647 \\\\u0645\\\\u06cc \\\\u062...\", \"q2\": \"\\\"\\\\u0686\\\\u06af\\\\u0648\\\\u0646\\\\u0647 \\\\u0648\\\\u0632\\\\u0646...\", \"category\": \"\\\"qqp\\\"\", \"label\": \"\\\"1\\\"\"}", "columns": ["q1", "q2", "category", "label"], "columns_mapping": {"q1": "q1", "q2": "q2", "category": "category", "label": "label"}, "dataset_description": "A Persian query paraphrasing task (paraphrase or not, given two questions). \nThe questions are partly mined using Google auto-complete, and partly translated from Quora paraphrasing dataset. \n", "dataset_name": "persiannlp/parsinlu_query_paraphrasing"}}, "tags": ["annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|quora|google", "language:fa"], "is_gated": false}, "persiannlp/parsinlu_sentiment": {"dataset_name": "persiannlp/parsinlu_sentiment", "description": "A Persian sentiment analysis task (deciding whether a given sentence contains a particular sentiment).", "downloads": 61, "configs": {"parsinlu-repo": {"config_name": "parsinlu-repo", "sample_row": "{\"review\": \"\\\"\\\\u062f\\\\u0648\\\\u0633\\\\u062a\\\\u0627\\\\u0646 \\\\u062d\\\\u062a...\", \"review_id\": \"\\\"1\\\"\", \"example_id\": \"\\\"1\\\"\", \"excel_id\": \"\\\"food_1744\\\"\", \"question\": \"\\\"\\\\u0646\\\\u0638\\\\u0631 \\\\u0634\\\\u0645\\\\u0627 \\\\u062f\\\\u063...\", \"category\": \"\\\"\\\\u06af\\\\u0648\\\\u0634\\\\u062a \\\\u0645\\\\u0631\\\\u063a\\\"\", \"aspect\": \"\\\"\\\\u0637\\\\u0639\\\\u0645\\\"\", \"label\": \"\\\"-3\\\"\", \"guid\": \"\\\"food-train-r1-e1\\\"\"}", "columns": ["review", "review_id", "example_id", "excel_id", "question", "category", "aspect", "label", "guid"], "columns_mapping": {"review": "review", "review_id": "review_id", "example_id": "example_id", "excel_id": "excel_id", "question": "question", "category": "category", "aspect": "aspect", "label": "label", "guid": "guid"}, "dataset_description": "A Persian sentiment analysis task (deciding whether a given sentence contains a particular sentiment). \n", "dataset_name": "persiannlp/parsinlu_sentiment"}}, "tags": ["task_ids:sentiment-analysis", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|translated|mnli", "language:fa"], "is_gated": false}, "projecte-aina/ancora-ca-ner": {"dataset_name": "projecte-aina/ancora-ca-ner", "description": "AnCora Catalan NER.\n This is a dataset for Named Eentity Reacognition (NER) from Ancora corpus adapted for \n Machine Learning and Language Model evaluation purposes.\n Since multiwords (including Named Entites) in the original Ancora corpus are aggregated as \n a single lexical item using underscores (e.g. \"Ajuntament_de_Barcelona\") \n we splitted them to align with word-per-line format, and added conventional Begin-Inside-Outside (IOB)\n tags to mark and classify Named Entites. \n We did not filter out the different categories of NEs from Ancora (weak and strong). \n We did 6 minor edits by hand.\n AnCora corpus is used under [CC-by] (https://creativecommons.org/licenses/by/4.0/) licence.\n This dataset was developed by BSC TeMU as part of the AINA project, and to enrich the Catalan Language Understanding Benchmark (CLUB).", "downloads": 42, "configs": {"AncoraCaNer": {"config_name": "AncoraCaNer", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Una\\\", \\\"setantena\\\", \\\"de\\\", \\\"treballadors\\\", \\\"de\\\", \\\"...\", \"ner_tags\": \"[8, 8, 8, 8, 8, 2, 6, 6, 6, 6, 8, 0, 8, 8, 8, 8, 8...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "AnCora Catalan NER.\n This is a dataset for Named Eentity Reacognition (NER) from Ancora corpus adapted for \n Machine Learning and Language Model evaluation purposes.\n Since multiwords (including Named Entites) in the original Ancora corpus are aggregated as \n a single lexical item using underscores (e.g. \"Ajuntament_de_Barcelona\") \n we splitted them to align with word-per-line format, and added conventional Begin-Inside-Outside (IOB)\n tags to mark and classify Named Entites. \n We did not filter out the different categories of NEs from Ancora (weak and strong). \n We did 6 minor edits by hand.\n AnCora corpus is used under [CC-by] (https://creativecommons.org/licenses/by/4.0/) licence.\n This dataset was developed by BSC TeMU as part of the AINA project, and to enrich the Catalan Language Understanding Benchmark (CLUB).\n ", "dataset_name": "projecte-aina/ancora-ca-ner"}}, "tags": ["annotations_creators:expert-generated", "multilinguality:monolingual", "language:ca"], "is_gated": false}, "projecte-aina/casum": {"dataset_name": "projecte-aina/casum", "description": "CaSum is a summarization dataset. It is extracted from a newswire corpus crawled from the Catalan News Agency. The corpus consists of 217,735 instances that are composed by the headline and the body.", "downloads": 50, "configs": {"CaSum": {"config_name": "CaSum", "sample_row": "{\"summary\": \"\\\"El Govern convoca eleccions a la presid\\\\u00e8ncia...\", \"text\": \"\\\"El Govern ha aprovat aquest dimarts el decret pel...\"}", "columns": ["summary", "text"], "columns_mapping": {"summary": "summary", "text": "text"}, "dataset_description": "CaSum is a summarization dataset. It is extracted from a newswire corpus crawled from the Catalan News Agency. The corpus consists of 217,735 instances that are composed by the headline and the body.\n", "dataset_name": "projecte-aina/casum"}}, "tags": ["task_categories:summarization", "annotations_creators:machine-generated", "multilinguality:monolingual", "language:ca"], "is_gated": false}, "projecte-aina/viquiquad": {"dataset_name": "projecte-aina/viquiquad", "description": "ViquiQuAD: an extractive QA dataset from Catalan Wikipedia.\nThis dataset contains 3111 contexts extracted from a set of 597 high quality original (no translations) \narticles in the Catalan Wikipedia \"Viquip\u00e8dia\" (ca.wikipedia.org), and 1 to 5 questions with their\nanswer for each fragment. Viquipedia articles are used under CC-by-sa licence. \nThis dataset can be used to build extractive-QA and Language Models.\nFunded by the Generalitat de Catalunya, Departament de Pol\u00edtiques Digitals i Administraci\u00f3 P\u00fablica (AINA),\nMT4ALL and Plan de Impulso de las Tecnolog\u00edas del Lenguaje (Plan TL).", "downloads": 69, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"P_66_C_391_Q1\\\"\", \"title\": \"\\\"Xavier Miserachs i Ribalta\\\"\", \"context\": \"\\\"En aquesta \\\\u00e8poca es va consolidar el concept...\", \"question\": \"\\\"De qu\\\\u00e8 es diferenciava el reportatge fotogr\\\\...\", \"answers\": \"[{\\\"text\\\": \\\"del fotoperiodisme[n. 2] i de la fotogr...\"}", "columns": ["id", "title", "context", "question", "answers"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers": "answers"}, "dataset_description": "ViquiQuAD: an extractive QA dataset from Catalan Wikipedia.\nThis dataset contains 3111 contexts extracted from a set of 597 high quality original (no translations) \narticles in the Catalan Wikipedia \"Viquip\u00e8dia\" (ca.wikipedia.org), and 1 to 5 questions with their\nanswer for each fragment. Viquipedia articles are used under CC-by-sa licence. \nThis dataset can be used to build extractive-QA and Language Models.\nFunded by the Generalitat de Catalunya, Departament de Pol\u00edtiques Digitals i Administraci\u00f3 P\u00fablica (AINA),\nMT4ALL and Plan de Impulso de las Tecnolog\u00edas del Lenguaje (Plan TL).\n", "dataset_name": "projecte-aina/viquiquad"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ca"], "is_gated": false}, "projecte-aina/wnli-ca": {"dataset_name": "projecte-aina/wnli-ca", "description": "professional translation into Catalan of Winograd NLI dataset as published in GLUE Benchmark.\n The Winograd NLI dataset presents 855 sentence pairs, \n in which the first sentence contains an ambiguity and the second one a possible interpretation of it. \n The label indicates if the interpretation is correct (1) or not (0).", "downloads": 22, "configs": {"winograd": {"config_name": "winograd", "sample_row": "{\"sentence1\": \"\\\"Vaig clavar una agulla en una pastanaga. Quan la ...\", \"sentence2\": \"\\\"La pastanaga tenia un forat.\\\"\", \"label\": \"1\"}", "columns": ["sentence1", "sentence2", "label"], "columns_mapping": {"sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "\n professional translation into Catalan of Winograd NLI dataset as published in GLUE Benchmark.\n The Winograd NLI dataset presents 855 sentence pairs, \n in which the first sentence contains an ambiguity and the second one a possible interpretation of it. \n The label indicates if the interpretation is correct (1) or not (0).\n ", "dataset_name": "projecte-aina/wnli-ca"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|glue", "language:ca"], "is_gated": false}, "qanastek/WMT-16-PubMed": {"dataset_name": "qanastek/WMT-16-PubMed", "description": "WMT'16 Biomedical Translation Task - PubMed parallel datasets\nhttp://www.statmt.org/wmt16/biomedical-translation-task.html", "downloads": 27, "configs": {"en-pt": {"config_name": "en-pt", "sample_row": "{\"translation.en\": \"\\\"Inequalities in self-rated health: an analysis of...\", \"translation.pt\": \"\\\"ERRATA\\\"\"}", "columns": ["translation_en", "translation_pt"], "columns_mapping": {"translation.en": "translation_en", "translation.pt": "translation_pt"}, "dataset_description": "\nWMT'16 Biomedical Translation Task - PubMed parallel datasets\nhttp://www.statmt.org/wmt16/biomedical-translation-task.html\n", "dataset_name": "qanastek/WMT-16-PubMed"}, "en-es": {"config_name": "en-es", "sample_row": "{\"translation.en\": \"\\\"Cruising and e-dates: a new context for sexual en...\", \"translation.es\": \"\\\"Cruising y e-citas: un nuevo contexto para los en...\"}", "columns": ["translation_en", "translation_es"], "columns_mapping": {"translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "\nWMT'16 Biomedical Translation Task - PubMed parallel datasets\nhttp://www.statmt.org/wmt16/biomedical-translation-task.html\n", "dataset_name": "qanastek/WMT-16-PubMed"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"translation.en\": \"\\\"Global Health: Where Do Physiotherapy and Rehabil...\", \"translation.fr\": \"\\\"La place des cheveux et des poils dans les rituel...\"}", "columns": ["translation_en", "translation_fr"], "columns_mapping": {"translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "\nWMT'16 Biomedical Translation Task - PubMed parallel datasets\nhttp://www.statmt.org/wmt16/biomedical-translation-task.html\n", "dataset_name": "qanastek/WMT-16-PubMed"}}, "tags": ["task_categories:translation", "annotations_creators:machine-generated", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:extended", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:hu", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:pl", "language:pt", "language:ro", "language:sk", "language:sl", "language:sv"], "is_gated": false}, "qwant/squad_fr": {"dataset_name": "qwant/squad_fr", "description": "SQuAD-fr is a French translated version of the Stanford Question Answering Dataset (SQuAD), the reference corpus to evaluate question answering models' performances in English.\nIt consists of 100K question-answer pairs on 500+ articles derived from the original English dataset and represents a large-scale dataset for closed-domain question answering on factoid questions in French.\nSQuAD-fr serves as a means of data augmentation on FQuAD and PIAF benchmarks, with 90K+ translated training pairs.", "downloads": 125, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"572ed956c246551400ce471c\\\"\", \"title\": \"\\\"Transistor\\\"\", \"context\": \"\\\"Un transistor est un dispositif semi-conducteur u...\", \"question\": \"\\\"Quelle est l'utilisation d'un transistor ?\\\"\", \"answers.text\": \"[\\\"amplifier ou commuter les signaux \\\\u00e9lectroni...\", \"answers.answer_start\": \"[61]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "SQuAD-fr is a French translated version of the Stanford Question Answering Dataset (SQuAD), the reference corpus to evaluate question answering models' performances in English.\nIt consists of 100K question-answer pairs on 500+ articles derived from the original English dataset and represents a large-scale dataset for closed-domain question answering on factoid questions in French.\nSQuAD-fr serves as a means of data augmentation on FQuAD and PIAF benchmarks, with 90K+ translated training pairs.\n", "dataset_name": "qwant/squad_fr"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "task_ids:closed-domain-qa", "annotations_creators:machine-generated", "multilinguality:monolingual", "multilinguality:translation", "source_datasets:extended|squad", "language:fr"], "is_gated": false}, "ramybaly/conll2012": {"dataset_name": "ramybaly/conll2012", "description": "The CoNLL-2012 shared task involved predicting coreference in English, Chinese, and Arabic, using the final version, v5.0,\nof the OntoNotes corpus. It was a follow-on to the English-only task organized in 2011. Until the creation of the OntoNotes\ncorpus, resources in this sub-field of language processing were limited to noun phrase coreference, often on a restricted\nset of entities, such as the ACE entities. OntoNotes provides a large-scale corpus of general anaphoric coreference not\nrestricted to noun phrases or to a specified set of entity types, and covers multiple languages. OntoNotes also provides\nadditional layers of integrated annotation, capturing additional shallow semantic structure. This paper describes the\nOntoNotes annotation (coreference and other layers) and then describes the parameters of the shared task including the\nformat, pre-processing information, evaluation criteria, and presents and discusses the results achieved by the participating\nsystems. The task of coreference has had a complex evaluation history. Potentially many evaluation conditions, have, in the past,\nmade it difficult to judge the improvement in new algorithms over previously reported results. Having a standard test set\nand standard evaluation parameters, all based on a resource that provides multiple integrated annotation layers (syntactic\nparses, semantic roles, word senses, named entities and coreference) and in multiple languages could support joint modeling\nand help ground and energize ongoing research in the task of entity and event coreference.\nFor more details see https://aclanthology.org/W12-4501.pdf", "downloads": 10, "configs": {"conll2012": {"config_name": "conll2012", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Big\\\", \\\"Managers\\\", \\\"on\\\", \\\"Campus\\\"]\", \"pos_tags\": \"[17, 26, 16, 23]\", \"tags\": \"[0, 0, 0, 0]\"}", "columns": ["id", "tokens", "pos_tags", "tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "tags": "tags"}, "dataset_description": "The CoNLL-2012 shared task involved predicting coreference in English, Chinese, and Arabic, using the final version, v5.0,\nof the OntoNotes corpus. It was a follow-on to the English-only task organized in 2011. Until the creation of the OntoNotes\ncorpus, resources in this sub-field of language processing were limited to noun phrase coreference, often on a restricted\nset of entities, such as the ACE entities. OntoNotes provides a large-scale corpus of general anaphoric coreference not\nrestricted to noun phrases or to a specified set of entity types, and covers multiple languages. OntoNotes also provides\nadditional layers of integrated annotation, capturing additional shallow semantic structure. This paper describes the\nOntoNotes annotation (coreference and other layers) and then describes the parameters of the shared task including the\nformat, pre-processing information, evaluation criteria, and presents and discusses the results achieved by the participating\nsystems. The task of coreference has had a complex evaluation history. Potentially many evaluation conditions, have, in the past,\nmade it difficult to judge the improvement in new algorithms over previously reported results. Having a standard test set\nand standard evaluation parameters, all based on a resource that provides multiple integrated annotation layers (syntactic\nparses, semantic roles, word senses, named entities and coreference) and in multiple languages could support joint modeling\nand help ground and energize ongoing research in the task of entity and event coreference.\nFor more details see https://aclanthology.org/W12-4501.pdf\n", "dataset_name": "ramybaly/conll2012"}}, "tags": [], "is_gated": false}, "sagteam/author_profiling": {"dataset_name": "sagteam/author_profiling", "description": "he corpus for the author profiling analysis contains texts in Russian-language which labeled for 5 tasks:\n1) gender -- 13530 texts with the labels, who wrote this: text female or male;\n2) age -- 13530 texts with the labels, how old the person who wrote the text. This is a number from 12 to 80. In addition, for the classification task we added 5 age groups: 1-19; 20-29; 30-39; 40-49; 50+;\n3) age imitation -- 7574 texts, where crowdsource authors is asked to write three texts: \n a) in their natural manner, \n b) imitating the style of someone younger, \n c) imitating the style of someone older;\n4) gender imitation -- 5956 texts, where the crowdsource authors is asked to write texts: in their origin gender and pretending to be the opposite gender;\n5) style imitation -- 5956 texts, where crowdsource authors is asked to write a text on behalf of another person of your own gender, with a distortion of the authors usual style.", "downloads": 14, "configs": {"main": {"config_name": "main", "sample_row": "{\"id\": \"\\\"crowdsource_4\\\"\", \"text\": \"\\\"\\\\u0437\\\\u0434\\\\u0440\\\\u0430\\\\u0432\\\\u0441\\\\u0442\\\\u0432\\\\...\", \"account_id\": \"\\\"account_#1009\\\"\", \"author_id\": \"2\", \"age\": \"21\", \"age_group\": \"\\\"20-29\\\"\", \"gender\": \"\\\"male\\\"\", \"no_imitation\": \"\\\"no_any_imitation\\\"\", \"age_imitation\": \"\\\"None\\\"\", \"gender_imitation\": \"\\\"no_gender_imitation\\\"\", \"style_imitation\": \"\\\"no_style_imitation\\\"\"}", "columns": ["id", "text", "account_id", "author_id", "age", "age_group", "gender", "no_imitation", "age_imitation", "gender_imitation", "style_imitation"], "columns_mapping": {"id": "id", "text": "text", "account_id": "account_id", "author_id": "author_id", "age": "age", "age_group": "age_group", "gender": "gender", "no_imitation": "no_imitation", "age_imitation": "age_imitation", "gender_imitation": "gender_imitation", "style_imitation": "style_imitation"}, "dataset_description": "he corpus for the author profiling analysis contains texts in Russian-language which labeled for 5 tasks:\n1) gender -- 13530 texts with the labels, who wrote this: text female or male;\n2) age -- 13530 texts with the labels, how old the person who wrote the text. This is a number from 12 to 80. In addition, for the classification task we added 5 age groups: 1-19; 20-29; 30-39; 40-49; 50+;\n3) age imitation -- 7574 texts, where crowdsource authors is asked to write three texts: \n a) in their natural manner, \n b) imitating the style of someone younger, \n c) imitating the style of someone older;\n4) gender imitation -- 5956 texts, where the crowdsource authors is asked to write texts: in their origin gender and pretending to be the opposite gender;\n5) style imitation -- 5956 texts, where crowdsource authors is asked to write a text on behalf of another person of your own gender, with a distortion of the authors usual style.\n", "dataset_name": "sagteam/author_profiling"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ru"], "is_gated": false}, "toloka/VoxDIY-RusNews": {"dataset_name": "toloka/VoxDIY-RusNews", "description": "VoxDIY: Benchmark Dataset for Russian Crowdsourced Audio Transcription.", "downloads": 29, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"task\": \"\\\"https://tlk.s3.yandex.net/annotation_tasks/russia...\", \"transcriptions\": \"\\\"\\\\u044d\\\\u0442\\\\u043e \\\\u0432\\\\u0438\\\\u0434\\\\u0438\\\\u043c...\", \"performers\": \"\\\"8 | 3200 | 3058 | 2702 | 2763 | 953 | 1573\\\"\", \"gt\": \"\\\"\\\\u044d\\\\u0442\\\\u043e \\\\u0432\\\\u0438\\\\u0434\\\\u0438\\\\u043c...\"}", "columns": ["task", "transcriptions", "performers", "gt"], "columns_mapping": {"task": "task", "transcriptions": "transcriptions", "performers": "performers", "gt": "gt"}, "dataset_description": "VoxDIY: Benchmark Dataset for Russian Crowdsourced Audio Transcription.\n", "dataset_name": "toloka/VoxDIY-RusNews"}}, "tags": ["task_categories:summarization", "task_categories:automatic-speech-recognition", "task_categories:text2text-generation", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:ru", "conditional-text-generation", "stuctured-to-text", "speech-recognition"], "is_gated": false}, "usc-isi/WikiConvert": {"dataset_name": "usc-isi/WikiConvert", "description": "Language Modelling with Cardinal Number Annotations.", "downloads": 93, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"0\", \"UNIQUE_STORY_INDEX\": \"\\\"0\\\"\", \"offset\": \"16\", \"length\": \"4\", \"magnitude\": \"0\", \"comment\": \"\\\"With a total of 1500 miles of inland waterways, A...\", \"number\": \"1500\"}", "columns": ["id", "UNIQUE_STORY_INDEX", "offset", "length", "magnitude", "comment", "number"], "columns_mapping": {"id": "id", "UNIQUE_STORY_INDEX": "UNIQUE_STORY_INDEX", "offset": "offset", "length": "length", "magnitude": "magnitude", "comment": "comment", "number": "number"}, "dataset_description": "Language Modelling with Cardinal Number Annotations.\n", "dataset_name": "usc-isi/WikiConvert"}}, "tags": ["task_categories:fill-mask", "task_categories:other", "task_categories:text-generation", "task_ids:language-modeling", "task_ids:masked-language-modeling", "multilinguality:monolingual", "source_datasets:extended|wikipedia", "language:en", "numeracy", "natural-language-understanding", "tokenization"], "is_gated": false}, "w11wo/imdb-javanese": {"dataset_name": "w11wo/imdb-javanese", "description": "Large Movie Review Dataset translated to Javanese.\r\nThis is a dataset for binary sentiment classification containing substantially\r\nmore data than previous benchmark datasets. We provide a set of 25,000 highly\r\npolar movie reviews for training, and 25,000 for testing. There is additional\r\nunlabeled data for use as well. We translated the original IMDB Dataset to\r\nJavanese using the multi-lingual MarianMT Transformer model from\r\n`Helsinki-NLP/opus-mt-en-mul`.", "downloads": 11, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"Bromwell High's komedia kom\\\\u00e9dia. Kuwi mlaku ...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "\nLarge Movie Review Dataset translated to Javanese.\nThis is a dataset for binary sentiment classification containing substantially\nmore data than previous benchmark datasets. We provide a set of 25,000 highly\npolar movie reviews for training, and 25,000 for testing. There is additional\nunlabeled data for use as well. We translated the original IMDB Dataset to\nJavanese using the multi-lingual MarianMT Transformer model from\n`Helsinki-NLP/opus-mt-en-mul`. \n", "dataset_name": "w11wo/imdb-javanese"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:jv"], "is_gated": false}, "yhavinga/mc4_nl_cleaned": {"dataset_name": "yhavinga/mc4_nl_cleaned", "description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.", "downloads": 322, "configs": {"micro": {"config_name": "micro", "sample_row": "{\"text\": \"\\\"Japanse bedrijven zijn niet alleen hondstrouw aan...\", \"timestamp\": \"\\\"2019-02-22T15:37:25Z\\\"\", \"url\": \"\\\"https://ondernemingen.bnpparibasfortis.be/nl/arti...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "tiny": {"config_name": "tiny", "sample_row": "{\"text\": \"\\\"De Engelstalige databank van ChemExper kan rechts...\", \"timestamp\": \"\\\"2017-03-24T00:03:21Z\\\"\", \"url\": \"\\\"http://gevaarlijkestoffen.be/databank/externedb.h...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "small": {"config_name": "small", "sample_row": "{\"text\": \"\\\"De gemeente Maassluis heeft een verzekering afges...\", \"timestamp\": \"\\\"2018-05-22T11:47:36Z\\\"\", \"url\": \"\\\"https://www.prinshendrik-maassluis.nl/vrijwillige...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "medium": {"config_name": "medium", "sample_row": "{\"text\": \"\\\"Joseph Beuys - Museum Schloss Moyland Zur Suche.Z...\", \"timestamp\": \"\\\"2017-06-23T01:44:23Z\\\"\", \"url\": \"\\\"http://www.moyland.de/nl/tentoonstellingen/joseph...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "large": {"config_name": "large", "sample_row": "{\"text\": \"\\\"1. Wil ING het Duitse Commerzbank overnemen?\\\\nVol...\", \"timestamp\": \"\\\"2019-08-23T00:08:17Z\\\"\", \"url\": \"\\\"https://www.mt.nl/nieuws/7-van-mt/wil-ing-commerz...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "full": {"config_name": "full", "sample_row": "{\"text\": \"\\\"Het is vrijdag avond half 6 en ik ga onderweg van...\", \"timestamp\": \"\\\"2020-06-04T16:28:02Z\\\"\", \"url\": \"\\\"https://sandertuinhof.com/maurtenclinic/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "micro_en_nl": {"config_name": "micro_en_nl", "sample_row": "{\"text\": \"\\\"Japanse bedrijven zijn niet alleen hondstrouw aan...\", \"timestamp\": \"\\\"2019-02-22T15:37:25Z\\\"\", \"url\": \"\\\"https://ondernemingen.bnpparibasfortis.be/nl/arti...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "tiny_en_nl": {"config_name": "tiny_en_nl", "sample_row": "{\"text\": \"\\\"IBB owner Shay Geyer greets guests at the IBB Day...\", \"timestamp\": \"\\\"2019-04-24T03:59:15Z\\\"\", \"url\": \"\\\"https://candysdirt.com/category/stage-me/page/2/\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "small_en_nl": {"config_name": "small_en_nl", "sample_row": "{\"text\": \"\\\"Deze milkshake is ontzettend vezelrijk door de ba...\", \"timestamp\": \"\\\"2017-03-30T12:41:30Z\\\"\", \"url\": \"\\\"http://perfecthousewife2b.nl/gezonde-chocolade-mi...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "medium_en_nl": {"config_name": "medium_en_nl", "sample_row": "{\"text\": \"\\\"Tja je weet wel hoe dat gaat.. op een dag ben je ...\", \"timestamp\": \"\\\"2018-07-21T19:25:07Z\\\"\", \"url\": \"\\\"http://alle-mooie-dingen.blogspot.com/2011/07/\\\"...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "large_en_nl": {"config_name": "large_en_nl", "sample_row": "{\"text\": \"\\\"N.B. Automatische incasso is een voorwaarde om li...\", \"timestamp\": \"\\\"2018-02-23T02:16:17Z\\\"\", \"url\": \"\\\"https://www.wsv-vada.nl/aanmelden\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}, "full_en_nl": {"config_name": "full_en_nl", "sample_row": "{\"text\": \"\\\"Charles is instrumental in the ESG assessment and...\", \"timestamp\": \"\\\"2019-04-26T04:32:46Z\\\"\", \"url\": \"\\\"https://riaconference.ca/speaker/charles-van-thie...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A thoroughly cleaned version of the Dutch portion of the multilingual \ncolossal, cleaned version of Common Crawl's web crawl corpus (mC4) by AllenAI.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's mC4 dataset by AllenAI, with further cleaning\ndetailed in the repository README file.\n", "dataset_name": "yhavinga/mc4_nl_cleaned"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "multilinguality:en-nl", "source_datasets:extended", "language:nl", "language:en"], "is_gated": false}, "yuanchuan/annotated_reference_strings": {"dataset_name": "yuanchuan/annotated_reference_strings", "description": "A repository of reference strings annotated using CSL processor using citations obtained from various sources.", "downloads": 12, "configs": {"default": {"config_name": "default", "sample_row": "{\"source\": \"\\\"crossref\\\"\", \"lang\": \"\\\"en\\\"\", \"entry_type\": \"\\\"article\\\"\", \"doi_prefix\": \"\\\"10.1021\\\"\", \"csl_style\": \"\\\"nature\\\"\", \"content\": \"\\\"1. She...\"}", "columns": ["source", "lang", "entry_type", "doi_prefix", "csl_style", "content"], "columns_mapping": {"source": "source", "lang": "lang", "entry_type": "entry_type", "doi_prefix": "doi_prefix", "csl_style": "csl_style", "content": "content"}, "dataset_description": "A repository of reference strings annotated using CSL processor using citations obtained from various sources.\n", "dataset_name": "yuanchuan/annotated_reference_strings"}}, "tags": ["task_categories:token-classification", "task_ids:parsing", "annotations_creators:other", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "nlpaueb/finer-139": {"dataset_name": "nlpaueb/finer-139", "description": "FiNER-139 is a named entity recognition dataset consisting of 10K annual \nand quarterly English reports (filings) of publicly traded companies \ndownloaded from the U.S. Securities and Exchange Commission (SEC) \nannotated with 139 XBRL tags in the IOB2 format.", "downloads": 213, "configs": {"finer-139": {"config_name": "finer-139", "sample_row": "{\"id\": \"0\", \"tokens\": \"[\\\"ITEM\\\", \\\"1\\\", \\\"Financial\\\", \\\"Statements\\\", \\\"Lennar\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nFiNER-139 is a named entity recognition dataset consisting of 10K annual \nand quarterly English reports (filings) of publicly traded companies \ndownloaded from the U.S. Securities and Exchange Commission (SEC) \nannotated with 139 XBRL tags in the IOB2 format.\n", "dataset_name": "nlpaueb/finer-139"}}, "tags": ["task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "language:en"], "is_gated": false}, "ruanchaves/snap": {"dataset_name": "ruanchaves/snap", "description": "Automatically segmented 803K SNAP Twitter Data Set hashtags with the heuristic described in the paper \"Segmenting hashtags using automatically created training data\".", "downloads": 10, "configs": {"default": {"config_name": "default", "sample_row": "{\"index\": \"0\", \"hashtag\": \"\\\"BrandThunder\\\"\", \"segmentation\": \"\\\"Brand Thunder\\\"\"}", "columns": ["index", "hashtag", "segmentation"], "columns_mapping": {"index": "index", "hashtag": "hashtag", "segmentation": "segmentation"}, "dataset_description": "\nAutomatically segmented 803K SNAP Twitter Data Set hashtags with the heuristic described in the paper \"Segmenting hashtags using automatically created training data\".\n", "dataset_name": "ruanchaves/snap"}}, "tags": ["annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "word-segmentation"], "is_gated": false}, "CLUTRR/v1": {"dataset_name": "CLUTRR/v1", "description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.", "downloads": 1686, "configs": {"gen_train23_test2to10": {"config_name": "gen_train23_test2to10", "sample_row": "{\"id\": \"\\\"f4161421-bf6e-4165-9133-07f1dcc4c87e\\\"\", \"story\": \"\\\"[Dorothy]'s brother [Michael] and her went to get...\", \"query\": \"\\\"('Donald', 'Dorothy')\\\"\", \"target\": \"0\", \"target_text\": \"\\\"aunt\\\"\", \"clean_story\": \"\\\"[Michael] is the proud father of the lovely [Dona...\", \"proof_state\": \"\\\"[{('Donald', 'aunt', 'Dorothy'): [('Donald', 'fat...\", \"f_comb\": \"\\\"father-sister\\\"\", \"task_name\": \"\\\"task_1.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2)]\\\"\", \"edge_types\": \"\\\"['father', 'sister']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"Donald:male,Michael:male,Dorothy:female\\\"\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}, "gen_train234_test2to10": {"config_name": "gen_train234_test2to10", "sample_row": "{\"id\": \"\\\"0fc660c1-e7d5-41fb-8d72-d2beb2c8d2ef\\\"\", \"story\": \"\\\"[Ashley]'s daughter, [Lillian], asked her mom to ...\", \"query\": \"\\\"('Ashley', 'Nicholas')\\\"\", \"target\": \"15\", \"target_text\": \"\\\"son\\\"\", \"clean_story\": \"\\\"[Ashley]'s daughter, [Lillian], asked her mom to ...\", \"proof_state\": \"\\\"[{('Ashley', 'son', 'Nicholas'): [('Ashley', 'dau...\", \"f_comb\": \"\\\"daughter-brother\\\"\", \"task_name\": \"\\\"task_1.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2)]\\\"\", \"edge_types\": \"\\\"['daughter', 'brother']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"Ashley:female,Lillian:female,Nicholas:male\\\"\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}, "rob_train_clean_23_test_all_23": {"config_name": "rob_train_clean_23_test_all_23", "sample_row": "{\"id\": \"\\\"00eabf71-62a0-446b-9c7d-df3896af0eb2\\\"\", \"story\": \"\\\"[Herman] asked his son, [James], to go grocery sh...\", \"query\": \"\\\"('Herman', 'Rosalee')\\\"\", \"target\": \"9\", \"target_text\": \"\\\"daughter-in-law\\\"\", \"clean_story\": \"\\\"[Herman] asked his son, [James], to go grocery sh...\", \"proof_state\": \"\\\"[{('Herman', 'daughter-in-law', 'Rosalee'): [('He...\", \"f_comb\": \"\\\"son-wife\\\"\", \"task_name\": \"\\\"task_1.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2)]\\\"\", \"edge_types\": \"\\\"['son', 'wife']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"Herman:male,James:male,Rosalee:female\\\"\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}, "rob_train_disc_23_test_all_23": {"config_name": "rob_train_disc_23_test_all_23", "sample_row": "{\"id\": \"\\\"e37f6b88-e6e6-403d-a9b3-efb50c2a2044\\\"\", \"story\": \"\\\"[Kathleen] and her son-in-law [William] went to v...\", \"query\": \"\\\"('James', 'John')\\\"\", \"target\": \"3\", \"target_text\": \"\\\"brother\\\"\", \"clean_story\": \"\\\"[Kathryn] likes baking brownies for her son [John...\", \"proof_state\": \"\\\"[{('James', 'brother', 'John'): [('James', 'mothe...\", \"f_comb\": \"\\\"mother-son\\\"\", \"task_name\": \"\\\"task_4.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2), (3, 4)]\\\"\", \"edge_types\": \"\\\"['mother', 'son']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"James:male,Kathryn:female,John:male,Kathleen:fema...\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}, "rob_train_irr_23_test_all_23": {"config_name": "rob_train_irr_23_test_all_23", "sample_row": "{\"id\": \"\\\"3e868d06-78e4-47ec-9272-4538d95214d4\\\"\", \"story\": \"\\\"[Brian] is one of [Geraldine]'s brothers. They ha...\", \"query\": \"\\\"('Geraldine', 'Nancy')\\\"\", \"target\": \"0\", \"target_text\": \"\\\"aunt\\\"\", \"clean_story\": \"\\\"[Preston] took his daughter [Geraldine] to ballet...\", \"proof_state\": \"\\\"[{('Geraldine', 'aunt', 'Nancy'): [('Geraldine', ...\", \"f_comb\": \"\\\"father-sister\\\"\", \"task_name\": \"\\\"task_3.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2), (0, 3)]\\\"\", \"edge_types\": \"\\\"['father', 'sister']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"Geraldine:female,Preston:male,Nancy:female,Brian:...\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}, "rob_train_sup_23_test_all_23": {"config_name": "rob_train_sup_23_test_all_23", "sample_row": "{\"id\": \"\\\"2ece8482-1db7-4395-9cb1-fa472b041e7d\\\"\", \"story\": \"\\\"[Darnell] loved his mother, [Theresa]. [Theresa] ...\", \"query\": \"\\\"('Amanda', 'Michelle')\\\"\", \"target\": \"4\", \"target_text\": \"\\\"sister\\\"\", \"clean_story\": \"\\\"[Theresa] was so proud of her daughter [Amanda] f...\", \"proof_state\": \"\\\"[{('Amanda', 'sister', 'Michelle'): [('Amanda', '...\", \"f_comb\": \"\\\"mother-daughter\\\"\", \"task_name\": \"\\\"task_2.2\\\"\", \"story_edges\": \"\\\"[(0, 1), (1, 2), (1, 3), (3, 2)]\\\"\", \"edge_types\": \"\\\"['mother', 'daughter']\\\"\", \"query_edge\": \"\\\"(0, 2)\\\"\", \"genders\": \"\\\"Amanda:female,Theresa:female,Michelle:female,Darn...\", \"task_split\": \"\\\"train\\\"\"}", "columns": ["id", "story", "query", "target", "target_text", "clean_story", "proof_state", "f_comb", "task_name", "story_edges", "edge_types", "query_edge", "genders", "task_split"], "columns_mapping": {"id": "id", "story": "story", "query": "query", "target": "target", "target_text": "target_text", "clean_story": "clean_story", "proof_state": "proof_state", "f_comb": "f_comb", "task_name": "task_name", "story_edges": "story_edges", "edge_types": "edge_types", "query_edge": "query_edge", "genders": "genders", "task_split": "task_split"}, "dataset_description": "CLUTRR (Compositional Language Understanding and Text-based Relational Reasoning),\n a diagnostic benchmark suite, is first introduced in (https://arxiv.org/abs/1908.06177) \n to test the systematic generalization and inductive reasoning capabilities of NLU systems.\n", "dataset_name": "CLUTRR/v1"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "ai4bharat/IndicHeadlineGeneration": {"dataset_name": "ai4bharat/IndicHeadlineGeneration", "description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.", "downloads": 33, "configs": {"as": {"config_name": "as", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u09ac\\\\u09bf\\\\u09b7\\\\u09df \\\\u09a8\\\\u09bf\\\\u09b0\\\\u09cd...\", \"target\": \"\\\"\\\\u09a8\\\\u09bf\\\\u09b0\\\\u09cd\\\\u09ac\\\\u09be\\\\u099a\\\\u09a8 ...\", \"url\": \"\\\"https://bengali.oneindia.com/topic/%E0%A6%A8%E0%A...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "bn": {"config_name": "bn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u099c \\\\u098f.\\\\u098f\\\\u099b.\\\\u098f\\\\u09a8. \\\\u09aa\\\\u...\", \"target\": \"\\\"\\\\u09ac\\\\u0999\\\\u09be\\\\u0987\\\\u0997\\\\u09be\\\\u0981\\\\u09f1\\\\...\", \"url\": \"\\\"https://www.newsasn.com/index.php/node/3523\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "gu": {"config_name": "gu", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0a9c\\\\u0abe\\\\u0a82\\\\u0aac\\\\u0ac1\\\\u0aa1\\\\u0ac0\\\\u0aaf\\\\...\", \"target\": \"\\\"\\\\u0aae\\\\u0acb\\\\u0ab0\\\\u0aac\\\\u0ac0\\\\u0aa8\\\\u0abe \\\\u0ab8...\", \"url\": \"\\\"http://abtakmedia.com/seven-villages-of-morbi-are...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0915\\\\u0928\\\\u093e\\\\u0921\\\\u093e \\\\u0905\\\\u092e\\\\u0947...\", \"target\": \"\\\"\\\\u0915\\\\u0928\\\\u093e\\\\u0921\\\\u093e \\\\u0908\\\\u0930\\\\u093e...\", \"url\": \"null\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "kn": {"config_name": "kn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0cb0\\\\u0cbe\\\\u0cb7\\\\u0ccd\\\\u0c9f\\\\u0ccd\\\\u0cb0\\\\u0cc0\\\\...\", \"target\": \"\\\"\\\\u0cb0\\\\u0cab\\\\u0cc7\\\\u0cb2\\\\u0ccd \\\\u0c96\\\\u0cb0\\\\u0cc0...\", \"url\": \"\\\"https://vknews.in/352131/?responsive=false\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "ml": {"config_name": "ml", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0d24\\\\u0d3f\\\\u0d30\\\\u0d41\\\\u0d35\\\\u0d28\\\\u0d28\\\\u0d4d\\\\...\", \"target\": \"\\\"\\\\u0d2a\\\\u0d3e\\\\u0d32\\\\u0d41\\\\u0d02 \\\\u0d15\\\\u0d1f\\\\u0d4d...\", \"url\": \"\\\"https://malayalam.oneindia.com/news/kerala/hotels...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "mr": {"config_name": "mr", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0909\\\\u092a\\\\u092e\\\\u0941\\\\u0916\\\\u094d\\\\u092f\\\\u092e\\\\...\", \"target\": \"\\\"\\\\u092e\\\\u0924\\\\u092d\\\\u093f\\\\u0928\\\\u094d\\\\u0928\\\\u0924\\\\...\", \"url\": \"\\\"https://www.dainikprabhat.com/despite-differences...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "or": {"config_name": "or", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0b13\\\\u0b21\\\\u0b3f\\\\u0b36\\\\u0b3e , \\\\u0b30\\\\u0b3e\\\\u0b...\", \"target\": \"\\\"\\\\u0b2a\\\\u0b3e\\\\u0b1f\\\\u0b15\\\\u0b41\\\\u0b30\\\\u0b3e\\\\u0b30\\\\...\", \"url\": \"\\\"http://utkalexpress.in/?p=23750\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "pa": {"config_name": "pa", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0a28\\\\u0a4b\\\\u0a2c\\\\u0a47\\\\u0a32 \\\\u0a2a\\\\u0a41\\\\u0a30...\", \"target\": \"\\\"\\\\u0a38\\\\u0a3f\\\\u0a39\\\\u0a24 \\\\u0a2e\\\\u0a70\\\\u0a24\\\\u0a30...\", \"url\": \"\\\"https://newsnumber.com/news/story/157908\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "ta": {"config_name": "ta", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0ba4\\\\u0bc2\\\\u0ba4\\\\u0bcd\\\\u0ba4\\\\u0bc1\\\\u0b95\\\\u0bcd\\\\...\", \"target\": \"\\\"\\\\u0b85\\\\u0bae\\\\u0bc8\\\\u0b9a\\\\u0bcd\\\\u0b9a\\\\u0bb0\\\\u0bcd ...\", \"url\": \"\\\"https://tamil.oneindia.com/news/tuticorin/manikar...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}, "te": {"config_name": "te", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0c4d\\\\u0c30\\\\u0c35\\\\u0c30\\\\u0c3f 23, 2019 174 \\\\u0c0...\", \"target\": \"\\\"\\\\u0c2e\\\\u0c39\\\\u0c3e\\\\u0c28\\\\u0c3e\\\\u0c2f\\\\u0c15\\\\u0c41\\\\...\", \"url\": \"\\\"https://manalokam.com/cinema/ntr-mahanayakudu-fir...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the new headline generation dataset released as part of IndicNLG Suite. Each \ninput document is paired an output title. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. The total\nsize of the dataset is 1.43M.\n", "dataset_name": "ai4bharat/IndicHeadlineGeneration"}}, "tags": ["annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original for Hindi, and modified [IndicGLUE](https://indicnlp.ai4bharat.org/indic-glue/) for other languages.", "language:as", "language:bn", "language:gu", "language:hi", "language:kn", "language:ml", "language:mr", "language:or", "language:pa", "language:ta", "language:te"], "is_gated": false}, "ai4bharat/IndicSentenceSummarization": {"dataset_name": "ai4bharat/IndicSentenceSummarization", "description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.", "downloads": 77, "configs": {"as": {"config_name": "as", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u09b2\\\\u0996\\\\u09a8\\\\u0989: \\\\u0989\\\\u09a4\\\\u09cd\\\\u09a...\", \"target\": \"\\\"\\\\u09ee\\\\u09ec \\\\u09b2\\\\u0995\\\\u09cd\\\\u09b7 \\\\u0995\\\\u09c...\", \"url\": \"\\\"https://bengali.abplive.com/news/nation/yogi-adit...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "bn": {"config_name": "bn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u099c \\\\u098f.\\\\u098f\\\\u099b.\\\\u098f\\\\u09a8. \\\\u09aa\\\\u...\", \"target\": \"\\\"\\\\u09ac\\\\u0999\\\\u09be\\\\u0987\\\\u0997\\\\u09be\\\\u0981\\\\u09f1\\\\...\", \"url\": \"\\\"https://www.newsasn.com/index.php/node/3523\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "gu": {"config_name": "gu", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0a9c\\\\u0abe\\\\u0a82\\\\u0aac\\\\u0ac1\\\\u0aa1\\\\u0ac0\\\\u0aaf\\\\...\", \"target\": \"\\\"\\\\u0aae\\\\u0acb\\\\u0ab0\\\\u0aac\\\\u0ac0\\\\u0aa8\\\\u0abe \\\\u0ab8...\", \"url\": \"\\\"http://abtakmedia.com/seven-villages-of-morbi-are...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0915\\\\u0928\\\\u093e\\\\u0921\\\\u093e \\\\u0905\\\\u092e\\\\u0947...\", \"target\": \"\\\"\\\\u0915\\\\u0928\\\\u093e\\\\u0921\\\\u093e \\\\u0908\\\\u0930\\\\u093e...\", \"url\": \"null\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "kn": {"config_name": "kn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0cb0\\\\u0cbe\\\\u0cb7\\\\u0ccd\\\\u0c9f\\\\u0ccd\\\\u0cb0\\\\u0cc0\\\\...\", \"target\": \"\\\"\\\\u0cb0\\\\u0cab\\\\u0cc7\\\\u0cb2\\\\u0ccd \\\\u0c96\\\\u0cb0\\\\u0cc0...\", \"url\": \"\\\"https://vknews.in/352131/?responsive=false\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "ml": {"config_name": "ml", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0d24\\\\u0d3f\\\\u0d30\\\\u0d41\\\\u0d35\\\\u0d28\\\\u0d28\\\\u0d4d\\\\...\", \"target\": \"\\\"\\\\u0d2a\\\\u0d3e\\\\u0d32\\\\u0d41\\\\u0d02 \\\\u0d15\\\\u0d1f\\\\u0d4d...\", \"url\": \"\\\"https://malayalam.oneindia.com/news/kerala/hotels...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "mr": {"config_name": "mr", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0909\\\\u092a\\\\u092e\\\\u0941\\\\u0916\\\\u094d\\\\u092f\\\\u092e\\\\...\", \"target\": \"\\\"\\\\u092e\\\\u0924\\\\u092d\\\\u093f\\\\u0928\\\\u094d\\\\u0928\\\\u0924\\\\...\", \"url\": \"\\\"https://www.dainikprabhat.com/despite-differences...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "or": {"config_name": "or", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0b13\\\\u0b21\\\\u0b3f\\\\u0b36\\\\u0b3e , \\\\u0b30\\\\u0b3e\\\\u0b...\", \"target\": \"\\\"\\\\u0b2a\\\\u0b3e\\\\u0b1f\\\\u0b15\\\\u0b41\\\\u0b30\\\\u0b3e\\\\u0b30\\\\...\", \"url\": \"\\\"http://utkalexpress.in/?p=23750\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "pa": {"config_name": "pa", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0a28\\\\u0a4b\\\\u0a2c\\\\u0a47\\\\u0a32 \\\\u0a2a\\\\u0a41\\\\u0a30...\", \"target\": \"\\\"\\\\u0a38\\\\u0a3f\\\\u0a39\\\\u0a24 \\\\u0a2e\\\\u0a70\\\\u0a24\\\\u0a30...\", \"url\": \"\\\"https://newsnumber.com/news/story/157908\\\"\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "ta": {"config_name": "ta", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0ba4\\\\u0bc2\\\\u0ba4\\\\u0bcd\\\\u0ba4\\\\u0bc1\\\\u0b95\\\\u0bcd\\\\...\", \"target\": \"\\\"\\\\u0b85\\\\u0bae\\\\u0bc8\\\\u0b9a\\\\u0bcd\\\\u0b9a\\\\u0bb0\\\\u0bcd ...\", \"url\": \"\\\"https://tamil.oneindia.com/news/tuticorin/manikar...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}, "te": {"config_name": "te", "sample_row": "{\"id\": \"\\\"1\\\"\", \"input\": \"\\\"\\\\u0c4d\\\\u0c30\\\\u0c35\\\\u0c30\\\\u0c3f 23, 2019 174 \\\\u0c0...\", \"target\": \"\\\"\\\\u0c2e\\\\u0c39\\\\u0c3e\\\\u0c28\\\\u0c3e\\\\u0c2f\\\\u0c15\\\\u0c41\\\\...\", \"url\": \"\\\"https://manalokam.com/cinema/ntr-mahanayakudu-fir...\"}", "columns": ["id", "input", "target", "url"], "columns_mapping": {"id": "id", "input": "input", "target": "target", "url": "url"}, "dataset_description": "This is the sentence summarization dataset released as part of IndicNLG Suite. Each \ninput sentence is paired with an output summary. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta and te. The total\nsize of the dataset is 431K.\n", "dataset_name": "ai4bharat/IndicSentenceSummarization"}}, "tags": ["annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original for Hindi, and modified [IndicGLUE](https://indicnlp.ai4bharat.org/indic-glue/) for other languages.", "language:as", "language:bn", "language:gu", "language:hi", "language:kn", "language:ml", "language:mr", "language:or", "language:pa", "language:ta", "language:te"], "is_gated": false}, "ai4bharat/IndicWikiBio": {"dataset_name": "ai4bharat/IndicWikiBio", "description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.", "downloads": 52, "configs": {"as": {"config_name": "as", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u09b6\\\\u09f0\\\\u09ce\\\\u099a\\\\u09a8\\\\u09cd\\\\u09a6...\", \"serialized_infobox\": \"\\\" name \\\\u09b6\\\\u09f0\\\\u09ce\\\\u099a\\\\u09a8\\\\...\", \"summary\": \"\\\"\\\\u09b6\\\\u09f0\\\\u09ce\\\\u099a\\\\u09a8\\\\u09cd\\\\u09a6\\\\u09cd\\\\...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "bn": {"config_name": "bn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"bgcolour_1:#\\\\tbgcolour_2:6495ED\\\\tname_1:\\\\u0987\\\\u0...\", \"serialized_infobox\": \"\\\" bgcolour # 6495ED name ...\", \"summary\": \"\\\"\\\\u0987\\\\u09ae\\\\u09a6\\\\u09be\\\\u09a6 \\\\u09b9\\\\u09cb\\\\u09b8...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0905\\\\u091c\\\\u092e\\\\u0947\\\\u0930\\\\tname_2:\\\\u0...\", \"serialized_infobox\": \"\\\" name \\\\u0905\\\\u091c\\\\u092e\\\\u0947\\\\u0930 ...\", \"summary\": \"\\\"\\\\u0905\\\\u091c\\\\u092e\\\\u0947\\\\u0930 \\\\u0930\\\\u094b\\\\u0921...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "kn": {"config_name": "kn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0c86\\\\u0cb0\\\\u0ccd\\\\tname_2:.\\\\tname_3:\\\\u0c8...\", \"serialized_infobox\": \"\\\" name \\\\u0c86\\\\u0cb0\\\\u0ccd . \\\\u0c8e\\\\u0c...\", \"summary\": \"\\\"\\\\u0c86\\\\u0cb0\\\\u0ccd. \\\\u0c8e\\\\u0ca8\\\\u0ccd. \\\\u0c9c\\\\u0...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "ml": {"config_name": "ml", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0d35\\\\u0d3e\\\\u0d38\\\\u0d4d\\\\u0d35\\\\u0d4b\\\\tname...\", \"serialized_infobox\": \"\\\" name \\\\u0d35\\\\u0d3e\\\\u0d38\\\\u0d4d\\\\u0d35\\\\...\", \"summary\": \"\\\"\\\\u0d05\\\\u0d2e\\\\u0d47\\\\u0d30\\\\u0d3f\\\\u0d15\\\\u0d4d\\\\u0d15\\\\...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "or": {"config_name": "or", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0b38\\\\u0b4b\\\\u0b28\\\\u0b3f\\\\u0b15\\\\u0b3e\\\\tname...\", \"serialized_infobox\": \"\\\" name \\\\u0b38\\\\u0b4b\\\\u0b28\\\\u0b3f\\\\u0b15\\\\...\", \"summary\": \"\\\"\\\\u0b38\\\\u0b4b\\\\u0b28\\\\u0b3f\\\\u0b15\\\\u0b3e \\\\u0b30\\\\u0b3e...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "pa": {"config_name": "pa", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0a38\\\\u0a42\\\\u0a30\\\\u0a1c\\\\tname_2:\\\\u0a2a\\\\u0...\", \"serialized_infobox\": \"\\\" name \\\\u0a38\\\\u0a42\\\\u0a30\\\\u0a1c \\\\u0a2a...\", \"summary\": \"\\\"\\\\u0a38\\\\u0a42\\\\u0a30\\\\u0a1c \\\\u0a2a\\\\u0a3e\\\\u0a32\\\\u0a40...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "ta": {"config_name": "ta", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0ba8\\\\u0b9e\\\\u0bcd\\\\u0b9a\\\\u0bb0\\\\u0bbe\\\\u0b9a...\", \"serialized_infobox\": \"\\\" name \\\\u0ba8\\\\u0b9e\\\\u0bcd\\\\u0b9a\\\\u0bb0\\\\...\", \"summary\": \"\\\"\\\\u0bae\\\\u0b95\\\\u0bbe\\\\u0bb0\\\\u0bbe\\\\u0b9a\\\\u0bbe \\\\u0b9a...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}, "te": {"config_name": "te", "sample_row": "{\"id\": \"\\\"1\\\"\", \"infobox\": \"\\\"name_1:\\\\u0c2f\\\\u0c02\\\\tname_2:.\\\\tname_3:\\\\u0c2f\\\\u0c0...\", \"serialized_infobox\": \"\\\" name \\\\u0c2f\\\\u0c02 . \\\\u0c2f\\\\u0c02 . \\\\...\", \"summary\": \"\\\"\\\\u0c2f\\\\u0c02. \\\\u0c2f\\\\u0c02. \\\\u0c36\\\\u0c4d\\\\u0c30\\\\u0...\"}", "columns": ["id", "infobox", "serialized_infobox", "summary"], "columns_mapping": {"id": "id", "infobox": "infobox", "serialized_infobox": "serialized_infobox", "summary": "summary"}, "dataset_description": "This is the WikiBio dataset released as part of IndicNLG Suite. Each \nexample has four fields: id, infobox, serialized infobox and summary. We create this dataset in nine \nlanguages including as, bn, hi, kn, ml, or, pa, ta, te. The total\nsize of the dataset is 57,426.\n", "dataset_name": "ai4bharat/IndicWikiBio"}}, "tags": ["annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:none. Originally generated from www.wikimedia.org.", "language:as", "language:bn", "language:hi", "language:kn", "language:ml", "language:or", "language:pa", "language:ta", "language:te"], "is_gated": false}, "ai4bharat/IndicQuestionGeneration": {"dataset_name": "ai4bharat/IndicQuestionGeneration", "description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.", "downloads": 37, "configs": {"as": {"config_name": "as", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"\\\\u09e7\\\\u09ee\\\\u09ec\\\\u09e6 \\\\u099a\\\\u09a8\\\\u09a4\\\"\", \"context\": \"\\\"\\\\u09e7\\\\u09ee\\\\u09ec\\\\u09e6 \\\\u099a\\\\u09a8\\\\u09a4 \\\\u099...\", \"question\": \"\\\"\\\\u0995\\\\u09be\\\\u09f0\\\\u09cd\\\\u09b2 \\\\u09b9\\\\u09be\\\\u0987...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "bn": {"config_name": "bn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"\\\\u09e7\\\\u09ee\\\\u09ec\\\\u09e6 \\\\u098f\\\\u09b0 \\\\u09a6\\\\u09b...\", \"context\": \"\\\"\\\\u09e7\\\\u09ee\\\\u09ec\\\\u09e6-\\\\u098f\\\\u09b0 \\\\u09a6\\\\u09b...\", \"question\": \"\\\"\\\\u0995\\\\u09be\\\\u09b0\\\\u09cd\\\\u09b2 \\\\u09b9\\\\u09be\\\\u0987...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "gu": {"config_name": "gu", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860 \\\\u0aa8\\\\u0abe \\\\u0aa6\\\\u0abe\\\\u0aaf\\\\u0a95\\\\u0abe\\\\...\", \"context\": \"\\\"1860\\\\u0aa8\\\\u0abe \\\\u0aa6\\\\u0abe\\\\u0aaf\\\\u0a95\\\\u0abe\\\\u...\", \"question\": \"\\\"\\\\u0a95\\\\u0abe\\\\u0ab0\\\\u0acd\\\\u0ab2 \\\\u0ab9\\\\u0ac7\\\\u0aa8...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"\\\\u0967\\\\u096e\\\\u096c\\\\u0966 \\\\u0915\\\\u0947 \\\\u0926\\\\u093...\", \"context\": \"\\\"1860 \\\\u0915\\\\u0947 \\\\u0926\\\\u0936\\\\u0915 \\\\u092e\\\\u0947...\", \"question\": \"\\\"\\\\u0915\\\\u093e\\\\u0930\\\\u094d\\\\u0932 \\\\u0939\\\\u0947\\\\u0928...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "kn": {"config_name": "kn", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860 \\\\u0cb0\\\\u0cb2\\\\u0ccd\\\\u0cb2\\\\u0cbf\\\"\", \"context\": \"\\\"1860\\\\u0cb0 \\\\u0ca6\\\\u0cb6\\\\u0c95\\\\u0ca6\\\\u0cb2\\\\u0ccd\\\\u...\", \"question\": \"\\\"\\\\u0c95\\\\u0cbe\\\\u0cb0\\\\u0ccd\\\\u0cb2\\\\u0ccd \\\\u0cb9\\\\u0cc6...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "ml": {"config_name": "ml", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860 \\\\u0d15\\\\u0d33\\\\u0d3f\\\\u0d7d\\\"\", \"context\": \"\\\"1860 \\\\u0d15\\\\u0d33\\\\u0d3f\\\\u0d7d \\\\u0d15\\\\u0d3e\\\\u0d7e ...\", \"question\": \"\\\"\\\\u0d15\\\\u0d3e\\\\u0d7e \\\\u0d39\\\\u0d46\\\\u0d7b\\\\u0d31\\\\u0d3f...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "mr": {"config_name": "mr", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"\\\\u0967\\\\u096e\\\\u096c\\\\u0966 \\\\u091a\\\\u094d\\\\u092f\\\\u093e...\", \"context\": \"\\\"\\\\u0967\\\\u096e\\\\u096c\\\\u0966 \\\\u091a\\\\u094d\\\\u092f\\\\u093e...\", \"question\": \"\\\"\\\\u0915\\\\u093e\\\\u0930\\\\u094d\\\\u0932 \\\\u0939\\\\u0947\\\\u0928...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "or": {"config_name": "or", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"\\\\u0b67\\\\u0b6e\\\\u0b6c\\\\u0b66 \\\\u0b2e\\\\u0b38\\\\u0b3f\\\\u0b39...\", \"context\": \"\\\"\\\\u0b67\\\\u0b6e\\\\u0b6c\\\\u0b66 \\\\u0b2e\\\\u0b38\\\\u0b3f\\\\u0b39...\", \"question\": \"\\\"\\\\u0b15\\\\u0b3e\\\\u0b30\\\\u0b4d\\\\u0b32 \\\\u0b39\\\\u0b3e\\\\u0b07...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "pa": {"config_name": "pa", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860 \\\\u0a26\\\\u0a47 \\\\u0a26\\\\u0a39\\\\u0a3e\\\\u0a15\\\\u0a47\\\"...\", \"context\": \"\\\"\\\\u0a38\\\\u0a3c\\\\u0a41\\\\u0a30\\\\u0a42\\\\u0a06\\\\u0a24\\\\u0a40 ...\", \"question\": \"\\\"\\\\u0a15\\\\u0a3e\\\\u0a30\\\\u0a32 \\\\u0a39\\\\u0a47\\\\u0a28\\\\u0a30...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "ta": {"config_name": "ta", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860 \\\\u0b95\\\\u0bb3\\\\u0bbf\\\\u0bb2\\\\u0bcd\\\"\", \"context\": \"\\\"1860 \\\\u0b95\\\\u0bb3\\\\u0bbf\\\\u0bb2\\\\u0bcd \\\\u0b95\\\\u0bbe\\\\...\", \"question\": \"\\\"\\\\u0b95\\\\u0bbe\\\\u0bb0\\\\u0bcd\\\\u0bb2\\\\u0bcd \\\\u0bb9\\\\u0bc8...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}, "te": {"config_name": "te", "sample_row": "{\"id\": \"\\\"1\\\"\", \"squad_id\": \"\\\"57101dc2b654c5140001f80a\\\"\", \"answer\": \"\\\"1860\\\\u0c32\\\\u0c32\\\\u0c4b\\\"\", \"context\": \"\\\"1860\\\\u0c32\\\\u0c32\\\\u0c4b \\\\u0c15\\\\u0c3e\\\\u0c30\\\\u0c4d\\\\u...\", \"question\": \"\\\"\\\\u0c15\\\\u0c3e\\\\u0c30\\\\u0c4d\\\\u0c32\\\\u0c4d \\\\u0c39\\\\u0c46...\"}", "columns": ["id", "squad_id", "answer", "context", "question"], "columns_mapping": {"id": "id", "squad_id": "squad_id", "answer": "answer", "context": "context", "question": "question"}, "dataset_description": "This is the Question Generation dataset released as part of IndicNLG Suite. Each \nexample has five fields: id, squad_id, answer, context and question. We create this dataset in eleven \nlanguages including as, bn, gu, hi, kn, ml, mr, or, pa, ta, te. This is a translated data. The examples in each language are exactly similar but in different languages. \nThe number of examples in each language is 98,027.\n", "dataset_name": "ai4bharat/IndicQuestionGeneration"}}, "tags": ["annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:we start with the SQuAD question answering dataset repurposed to serve as a question generation dataset. We translate this dataset into different Indic languages.", "language:as", "language:bn", "language:gu", "language:hi", "language:kn", "language:ml", "language:mr", "language:or", "language:pa", "language:ta", "language:te"], "is_gated": false}, "ruanchaves/reddit_china": {"dataset_name": "ruanchaves/reddit_china", "description": "Reddit comments with the word 'China' between 2010 and 2022.", "downloads": 106, "configs": {"default": {"config_name": "default", "sample_row": "{\"author\": \"\\\"Grunt08\\\"\", \"author_fullname\": \"\\\"t2_c8dzz\\\"\", \"body\": \"\\\"They also have encrypted radios, but they're made...\", \"created_utc\": \"1646931561\", \"id\": \"\\\"i04kut4\\\"\", \"is_submitter\": \"false\", \"link_id\": \"\\\"t3_tb2v7q\\\"\", \"locked\": \"false\", \"no_follow\": \"true\", \"parent_id\": \"\\\"t1_i04j877\\\"\", \"permalink\": \"\\\"/r/ukraine/comments/tb2v7q/apparently_russian_mil...\", \"retrieved_on\": \"null\", \"score\": \"1\", \"send_replies\": \"true\", \"stickied\": \"false\", \"subreddit\": \"\\\"ukraine\\\"\", \"subreddit_id\": \"\\\"t5_2qqcn\\\"\"}", "columns": ["author", "author_fullname", "body", "created_utc", "id", "is_submitter", "link_id", "locked", "no_follow", "parent_id", "permalink", "retrieved_on", "score", "send_replies", "stickied", "subreddit", "subreddit_id"], "columns_mapping": {"author": "author", "author_fullname": "author_fullname", "body": "body", "created_utc": "created_utc", "id": "id", "is_submitter": "is_submitter", "link_id": "link_id", "locked": "locked", "no_follow": "no_follow", "parent_id": "parent_id", "permalink": "permalink", "retrieved_on": "retrieved_on", "score": "score", "send_replies": "send_replies", "stickied": "stickied", "subreddit": "subreddit", "subreddit_id": "subreddit_id"}, "dataset_description": "\nReddit comments with the word 'China' between 2010 and 2022.\n", "dataset_name": "ruanchaves/reddit_china"}}, "tags": [], "is_gated": false}, "wikitablequestions": {"dataset_name": "wikitablequestions", "description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.", "downloads": 1889, "configs": {"random-split-1": {"config_name": "random-split-1", "sample_row": "{\"id\": \"\\\"nt-0\\\"\", \"question\": \"\\\"what was the last year where this team was a part...\", \"answers\": \"[\\\"2004\\\"]\", \"table.header\": \"[\\\"Year\\\", \\\"Division\\\", \\\"League\\\", \\\"Regular Season\\\", \\\"...\", \"table.rows\": \"[[\\\"2001\\\", \\\"2\\\", \\\"USL A-League\\\", \\\"4th, Western\\\", \\\"Qu...\", \"table.name\": \"\\\"csv/204-csv/590.tsv\\\"\"}", "columns": ["id", "question", "answers", "table_header", "table_rows", "table_name"], "columns_mapping": {"id": "id", "question": "question", "answers": "answers", "table.header": "table_header", "table.rows": "table_rows", "table.name": "table_name"}, "dataset_description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.\n", "dataset_name": "wikitablequestions"}, "random-split-2": {"config_name": "random-split-2", "sample_row": "{\"id\": \"\\\"nt-2\\\"\", \"question\": \"\\\"which team won previous to crettyard?\\\"\", \"answers\": \"[\\\"Wolfe Tones\\\"]\", \"table.header\": \"[\\\"Team\\\", \\\"County\\\", \\\"Wins\\\", \\\"Years won\\\"]\", \"table.rows\": \"[[\\\"Greystones\\\", \\\"Wicklow\\\", \\\"1\\\", \\\"2011\\\"], [\\\"Ballymo...\", \"table.name\": \"\\\"csv/204-csv/772.tsv\\\"\"}", "columns": ["id", "question", "answers", "table_header", "table_rows", "table_name"], "columns_mapping": {"id": "id", "question": "question", "answers": "answers", "table.header": "table_header", "table.rows": "table_rows", "table.name": "table_name"}, "dataset_description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.\n", "dataset_name": "wikitablequestions"}, "random-split-3": {"config_name": "random-split-3", "sample_row": "{\"id\": \"\\\"nt-0\\\"\", \"question\": \"\\\"what was the last year where this team was a part...\", \"answers\": \"[\\\"2004\\\"]\", \"table.header\": \"[\\\"Year\\\", \\\"Division\\\", \\\"League\\\", \\\"Regular Season\\\", \\\"...\", \"table.rows\": \"[[\\\"2001\\\", \\\"2\\\", \\\"USL A-League\\\", \\\"4th, Western\\\", \\\"Qu...\", \"table.name\": \"\\\"csv/204-csv/590.tsv\\\"\"}", "columns": ["id", "question", "answers", "table_header", "table_rows", "table_name"], "columns_mapping": {"id": "id", "question": "question", "answers": "answers", "table.header": "table_header", "table.rows": "table_rows", "table.name": "table_name"}, "dataset_description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.\n", "dataset_name": "wikitablequestions"}, "random-split-4": {"config_name": "random-split-4", "sample_row": "{\"id\": \"\\\"nt-0\\\"\", \"question\": \"\\\"what was the last year where this team was a part...\", \"answers\": \"[\\\"2004\\\"]\", \"table.header\": \"[\\\"Year\\\", \\\"Division\\\", \\\"League\\\", \\\"Regular Season\\\", \\\"...\", \"table.rows\": \"[[\\\"2001\\\", \\\"2\\\", \\\"USL A-League\\\", \\\"4th, Western\\\", \\\"Qu...\", \"table.name\": \"\\\"csv/204-csv/590.tsv\\\"\"}", "columns": ["id", "question", "answers", "table_header", "table_rows", "table_name"], "columns_mapping": {"id": "id", "question": "question", "answers": "answers", "table.header": "table_header", "table.rows": "table_rows", "table.name": "table_name"}, "dataset_description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.\n", "dataset_name": "wikitablequestions"}, "random-split-5": {"config_name": "random-split-5", "sample_row": "{\"id\": \"\\\"nt-0\\\"\", \"question\": \"\\\"what was the last year where this team was a part...\", \"answers\": \"[\\\"2004\\\"]\", \"table.header\": \"[\\\"Year\\\", \\\"Division\\\", \\\"League\\\", \\\"Regular Season\\\", \\\"...\", \"table.rows\": \"[[\\\"2001\\\", \\\"2\\\", \\\"USL A-League\\\", \\\"4th, Western\\\", \\\"Qu...\", \"table.name\": \"\\\"csv/204-csv/590.tsv\\\"\"}", "columns": ["id", "question", "answers", "table_header", "table_rows", "table_name"], "columns_mapping": {"id": "id", "question": "question", "answers": "answers", "table.header": "table_header", "table.rows": "table_rows", "table.name": "table_name"}, "dataset_description": "This WikiTableQuestions dataset is a large-scale dataset for the task of question answering on semi-structured tables.\n", "dataset_name": "wikitablequestions"}}, "tags": ["task_categories:question-answering", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "table-question-answering"], "is_gated": false}, "GEM/xwikis": {"dataset_name": "GEM/xwikis", "description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.", "downloads": 141, "configs": {"en-fr": {"config_name": "en-fr", "sample_row": "{\"gem_id\": \"\\\"en-fr-train-86694\\\"\", \"gem_parent_id\": \"\\\"en-fr-train-86694\\\"\", \"id\": \"\\\"86694\\\"\", \"src_title\": \"\\\"Abstract algebra\\\"\", \"tgt_title\": \"\\\"Alg\\\\u00e8bre g\\\\u00e9n\\\\u00e9rale\\\"\", \"src_document.title\": \"[\\\"History.\\\", \\\"Early group theory.\\\", \\\"Modern algebr...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"As in other parts of mathematics, concrete probl...\", \"src_summary\": \"\\\"In algebra, which is a broad division of mathemat...\", \"tgt_summary\": \"\\\"L'alg\\\\u00e8bre g\\\\u00e9n\\\\u00e9rale, ou alg\\\\u00e8br...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "en-cs": {"config_name": "en-cs", "sample_row": "{\"gem_id\": \"\\\"en-cs-train-730484\\\"\", \"gem_parent_id\": \"\\\"en-cs-train-730484\\\"\", \"id\": \"\\\"730484\\\"\", \"src_title\": \"\\\"Astronomy\\\"\", \"tgt_title\": \"\\\"Astronomie\\\"\", \"src_document.title\": \"[\\\"Etymology.\\\", \\\"Use of terms \\\\\\\"astronomy\\\\\\\" and \\\\\\\"a...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"\\\\\\\"Astronomy\\\\\\\" (from the Greek \\\\u1f00\\\\u03c3\\\\u03c4...\", \"src_summary\": \"\\\"Astronomy (from ) is a natural science that studi...\", \"tgt_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "en-de": {"config_name": "en-de", "sample_row": "{\"gem_id\": \"\\\"en-de-train-610175\\\"\", \"gem_parent_id\": \"\\\"en-de-train-610175\\\"\", \"id\": \"\\\"610175\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"Alan Smithee\\\"\", \"src_document.title\": \"[\\\"History.\\\", \\\"Uses.\\\", \\\"Film direction.\\\"]\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"2\\\"]\", \"src_document.content\": \"[\\\"Before 1968, DGA rules did not permit directors ...\", \"src_summary\": \"\\\"Alan Smithee (also Allen Smithee) is an official ...\", \"tgt_summary\": \"\\\"Alan Smithee steht als Pseudonym f\\\\u00fcr einen f...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "en-zh": {"config_name": "en-zh", "sample_row": "{\"gem_id\": \"\\\"en-zh-train-2184703\\\"\", \"gem_parent_id\": \"\\\"en-zh-train-2184703\\\"\", \"id\": \"\\\"2184703\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"\\\\u827e\\\\u4f26\\\\u00b7\\\\u53f2\\\\u5bc6\\\\u897f\\\"\", \"src_document.title\": \"[\\\"History.\\\", \\\"Uses.\\\", \\\"Film direction.\\\"]\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"2\\\"]\", \"src_document.content\": \"[\\\"Before 1968, DGA rules did not permit directors ...\", \"src_summary\": \"\\\"Alan Smithee (also Allen Smithee) is an official ...\", \"tgt_summary\": \"\\\"\\\\u827e\\\\u4f26\\\\u00b7\\\\u53f2\\\\u5bc6\\\\u897f(\\\\u82f1\\\\u8bed...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "fr-en": {"config_name": "fr-en", "sample_row": "{\"gem_id\": \"\\\"fr-en-train-1566090\\\"\", \"gem_parent_id\": \"\\\"fr-en-train-1566090\\\"\", \"id\": \"\\\"1566090\\\"\", \"src_title\": \"\\\"Antoine Meillet\\\"\", \"tgt_title\": \"\\\"Antoine Meillet\\\"\", \"src_document.title\": \"[\\\"Biographie.\\\", \\\"\\\\u00c9tudes hom\\\\u00e9riques.\\\"]\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"D'origine bourbonnaise, fils d'un notaire de Ch\\\\...\", \"src_summary\": \"\\\"Paul Jules Antoine Meillet, n\\\\u00e9 le \\\\u00e0 Mou...\", \"tgt_summary\": \"\\\"Paul Jules Antoine Meillet (; 11 November 1866, M...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "fr-cs": {"config_name": "fr-cs", "sample_row": "{\"gem_id\": \"\\\"fr-cs-train-37687\\\"\", \"gem_parent_id\": \"\\\"fr-cs-train-37687\\\"\", \"id\": \"\\\"37687\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"\\\"Astronomie\\\"\", \"src_document.title\": \"[\\\"Histoire.\\\", \\\"N\\\\u00e9olithique.\\\", \\\"Antiquit\\\\u00e9...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"3\\\", \\\"3\\\", \\\"3\\\", \\\"2\\\", \\\"3\\\", \\\"3\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"L'astronomie est consid\\\\u00e9r\\\\u00e9e comme la p...\", \"src_summary\": \"\\\"L\\\\u2019astronomie est la science de l\\\\u2019observ...\", \"tgt_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "fr-de": {"config_name": "fr-de", "sample_row": "{\"gem_id\": \"\\\"fr-de-train-1275594\\\"\", \"gem_parent_id\": \"\\\"fr-de-train-1275594\\\"\", \"id\": \"\\\"1275594\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"Alan Smithee\\\"\", \"src_document.title\": \"[\\\"Origine.\\\"]\", \"src_document.section_level\": \"[\\\"1\\\"]\", \"src_document.content\": \"[\\\"Dans un pays o\\\\u00f9 l'Oscar du meilleur film es...\", \"src_summary\": \"\\\"Alan Smithee (on rencontre aussi les formes Allen...\", \"tgt_summary\": \"\\\"Alan Smithee steht als Pseudonym f\\\\u00fcr einen f...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "fr-zh": {"config_name": "fr-zh", "sample_row": "{\"gem_id\": \"\\\"fr-zh-train-1595497\\\"\", \"gem_parent_id\": \"\\\"fr-zh-train-1595497\\\"\", \"id\": \"\\\"1595497\\\"\", \"src_title\": \"\\\"Antoine Meillet\\\"\", \"tgt_title\": \"\\\"\\\\u5b89\\\\u4e1c\\\\u5c3c\\\\u00b7\\\\u6885\\\\u8036\\\"\", \"src_document.title\": \"[\\\"Biographie.\\\", \\\"\\\\u00c9tudes hom\\\\u00e9riques.\\\"]\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"D'origine bourbonnaise, fils d'un notaire de Ch\\\\...\", \"src_summary\": \"\\\"Paul Jules Antoine Meillet, n\\\\u00e9 le \\\\u00e0 Mou...\", \"tgt_summary\": \"\\\"\\\\u5b89\\\\u4e1c\\\\u5c3c\\\\u00b7\\\\u6885\\\\u8036(1866\\\\u5e7411...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "cs-en": {"config_name": "cs-en", "sample_row": "{\"gem_id\": \"\\\"cs-en-train-595004\\\"\", \"gem_parent_id\": \"\\\"cs-en-train-595004\\\"\", \"id\": \"\\\"595004\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"\\\"Astronomy\\\"\", \"src_document.title\": \"[\\\"Historie astronomie.\\\", \\\"Antika.\\\", \\\"Novov\\\\u011bk....\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Astronomie se podobn\\\\u011b jako dal\\\\u0161\\\\u0...\", \"src_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\", \"tgt_summary\": \"\\\"Astronomy (from ) is a natural science that studi...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "cs-fr": {"config_name": "cs-fr", "sample_row": "{\"gem_id\": \"\\\"cs-fr-train-909261\\\"\", \"gem_parent_id\": \"\\\"cs-fr-train-909261\\\"\", \"id\": \"\\\"909261\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"\\\"Astronomie\\\"\", \"src_document.title\": \"[\\\"Historie astronomie.\\\", \\\"Antika.\\\", \\\"Novov\\\\u011bk....\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Astronomie se podobn\\\\u011b jako dal\\\\u0161\\\\u0...\", \"src_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\", \"tgt_summary\": \"\\\"L\\\\u2019astronomie est la science de l\\\\u2019observ...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "cs-de": {"config_name": "cs-de", "sample_row": "{\"gem_id\": \"\\\"cs-de-train-38396\\\"\", \"gem_parent_id\": \"\\\"cs-de-train-38396\\\"\", \"id\": \"\\\"38396\\\"\", \"src_title\": \"\\\"Ak\\\\u010dn\\\\u00ed film\\\"\", \"tgt_title\": \"\\\"Actionfilm\\\"\", \"src_document.title\": \"[\\\"Prvn\\\\u00ed ak\\\\u010dn\\\\u00ed filmy.\\\", \\\"N\\\\u00e1stup...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"Za prvn\\\\u00ed ak\\\\u010dn\\\\u00ed sc\\\\u00e9nu b\\\\u00fd...\", \"src_summary\": \"\\\"Ak\\\\u010dn\\\\u00ed film je filmov\\\\u00fd \\\\u017e\\\\u00e1...\", \"tgt_summary\": \"\\\"Der Actionfilm (von engl. \\\\\\\"action\\\\\\\": Tat, Handlu...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "cs-zh": {"config_name": "cs-zh", "sample_row": "{\"gem_id\": \"\\\"cs-zh-train-241305\\\"\", \"gem_parent_id\": \"\\\"cs-zh-train-241305\\\"\", \"id\": \"\\\"241305\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"\\\"\\\\u5929\\\\u6587\\\\u5b78\\\"\", \"src_document.title\": \"[\\\"Historie astronomie.\\\", \\\"Antika.\\\", \\\"Novov\\\\u011bk....\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Astronomie se podobn\\\\u011b jako dal\\\\u0161\\\\u0...\", \"src_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\", \"tgt_summary\": \"\\\"\\\\u5929\\\\u6587\\\\u5b66\\\\u662f\\\\u4e00\\\\u95e8\\\\u81ea\\\\u7136\\\\...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "de-en": {"config_name": "de-en", "sample_row": "{\"gem_id\": \"\\\"de-en-train-1021816\\\"\", \"gem_parent_id\": \"\\\"de-en-train-1021816\\\"\", \"id\": \"\\\"1021816\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"Alan Smithee\\\"\", \"src_document.title\": \"[\\\"Geschichte.\\\", \\\"Entstehung.\\\", \\\"Aufdeckung und Abk...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Das Pseudonym entstand 1968 infolge der Arbe...\", \"src_summary\": \"\\\"Alan Smithee steht als Pseudonym f\\\\u00fcr einen f...\", \"tgt_summary\": \"\\\"Alan Smithee (also Allen Smithee) is an official ...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "de-fr": {"config_name": "de-fr", "sample_row": "{\"gem_id\": \"\\\"de-fr-train-1069456\\\"\", \"gem_parent_id\": \"\\\"de-fr-train-1069456\\\"\", \"id\": \"\\\"1069456\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"Alan Smithee\\\"\", \"src_document.title\": \"[\\\"Geschichte.\\\", \\\"Entstehung.\\\", \\\"Aufdeckung und Abk...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Das Pseudonym entstand 1968 infolge der Arbe...\", \"src_summary\": \"\\\"Alan Smithee steht als Pseudonym f\\\\u00fcr einen f...\", \"tgt_summary\": \"\\\"Alan Smithee (on rencontre aussi les formes Allen...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "de-cs": {"config_name": "de-cs", "sample_row": "{\"gem_id\": \"\\\"de-cs-train-1599234\\\"\", \"gem_parent_id\": \"\\\"de-cs-train-1599234\\\"\", \"id\": \"\\\"1599234\\\"\", \"src_title\": \"\\\"Ang Lee\\\"\", \"tgt_title\": \"\\\"Ang Lee\\\"\", \"src_document.title\": \"[\\\"Leben.\\\", \\\"Filmisches Werk.\\\", \\\"1992\\\\u20131994: Di...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"Ang Lee wurde 1954 in Taiwan geboren. Seine Elte...\", \"src_summary\": \"\\\"Ang Lee (; * 23. Oktober 1954 in Chaozhou, Landkr...\", \"tgt_summary\": \"\\\"Ang Lee (* 23. \\\\u0159\\\\u00edjna 1954, Pingtung, Tc...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "de-zh": {"config_name": "de-zh", "sample_row": "{\"gem_id\": \"\\\"de-zh-train-387483\\\"\", \"gem_parent_id\": \"\\\"de-zh-train-387483\\\"\", \"id\": \"\\\"387483\\\"\", \"src_title\": \"\\\"Alan Smithee\\\"\", \"tgt_title\": \"\\\"\\\\u827e\\\\u502b\\\\u00b7\\\\u53f2\\\\u5bc6\\\\u897f\\\"\", \"src_document.title\": \"[\\\"Geschichte.\\\", \\\"Entstehung.\\\", \\\"Aufdeckung und Abk...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Das Pseudonym entstand 1968 infolge der Arbe...\", \"src_summary\": \"\\\"Alan Smithee steht als Pseudonym f\\\\u00fcr einen f...\", \"tgt_summary\": \"\\\"\\\\u827e\\\\u4f26\\\\u00b7\\\\u53f2\\\\u5bc6\\\\u897f(\\\\u82f1\\\\u8bed...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "zh-en": {"config_name": "zh-en", "sample_row": "{\"gem_id\": \"\\\"zh-en-train-2183211\\\"\", \"gem_parent_id\": \"\\\"zh-en-train-2183211\\\"\", \"id\": \"\\\"2183211\\\"\", \"src_title\": \"\\\"\\\\u9515\\\"\", \"tgt_title\": \"\\\"Actinium\\\"\", \"src_document.title\": \"[\\\"\\\\u5386\\\\u53f2.\\\", \\\"\\\\u5c5e\\\\u6027.\\\", \\\"\\\\u5316\\\\u5408\\\\u...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\"]...\", \"src_document.content\": \"[\\\"\\\\u6cd5\\\\u56fd\\\\u5316\\\\u5b66\\\\u5bb6\\\\u5b89\\\\u5fb7\\\\u70c8...\", \"src_summary\": \"\\\"\\\\u9515\\\\u662f\\\\u4e00\\\\u79cd\\\\u653e\\\\u5c04\\\\u6027\\\\u91d1\\\\...\", \"tgt_summary\": \"\\\"Actinium is a chemical element with the symbol Ac...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "zh-fr": {"config_name": "zh-fr", "sample_row": "{\"gem_id\": \"\\\"zh-fr-train-1570465\\\"\", \"gem_parent_id\": \"\\\"zh-fr-train-1570465\\\"\", \"id\": \"\\\"1570465\\\"\", \"src_title\": \"\\\"\\\\u62bd\\\\u8c61\\\\u4ee3\\\\u6570\\\"\", \"tgt_title\": \"\\\"Alg\\\\u00e8bre g\\\\u00e9n\\\\u00e9rale\\\"\", \"src_document.title\": \"[\\\"\\\\u5386\\\\u53f2.\\\", \\\"\\\\u65e9\\\\u671f\\\\u7684\\\\u7fa4\\\\u8bba....\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\\u5982\\\\u540c\\\\u5176\\\\u4ed6\\\\u7684\\\\u6570\\\\u5b66\\\\u9886...\", \"src_summary\": \"\\\"\\\\u62bd\\\\u8c61\\\\u4ee3\\\\u6570\\\\u4f5c\\\\u4e3a\\\\u6570\\\\u5b66\\\\...\", \"tgt_summary\": \"\\\"L'alg\\\\u00e8bre g\\\\u00e9n\\\\u00e9rale, ou alg\\\\u00e8br...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "zh-cs": {"config_name": "zh-cs", "sample_row": "{\"gem_id\": \"\\\"zh-cs-train-291204\\\"\", \"gem_parent_id\": \"\\\"zh-cs-train-291204\\\"\", \"id\": \"\\\"291204\\\"\", \"src_title\": \"\\\"\\\\u65c5\\\\u6e38\\\"\", \"tgt_title\": \"\\\"Turistika\\\"\", \"src_document.title\": \"[\\\"\\\\u65c5\\\\u6e38\\\\u4eba\\\\u58eb.\\\", \\\"\\\\u65c5\\\\u6e38\\\\u7406\\\\...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"\\\\u65c5\\\\u6e38\\\\u4eba\\\\u58eb\\\\u79f0\\\\u4e3a\\\\u65c5\\\\u5ba2...\", \"src_summary\": \"\\\"\\\\u65c5\\\\u6e38\\\\u5c31\\\\u662f\\\\u65c5\\\\u884c\\\\u6e38\\\\u89c8\\\\...\", \"tgt_summary\": \"\\\"Turistika je z\\\\u00e1jmov\\\\u00e1 \\\\u010dinnost (spor...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "zh-de": {"config_name": "zh-de", "sample_row": "{\"gem_id\": \"\\\"zh-de-train-709836\\\"\", \"gem_parent_id\": \"\\\"zh-de-train-709836\\\"\", \"id\": \"\\\"709836\\\"\", \"src_title\": \"\\\"\\\\u9515\\\"\", \"tgt_title\": \"\\\"Actinium\\\"\", \"src_document.title\": \"[\\\"\\\\u5386\\\\u53f2.\\\", \\\"\\\\u5c5e\\\\u6027.\\\", \\\"\\\\u5316\\\\u5408\\\\u...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\"]...\", \"src_document.content\": \"[\\\"\\\\u6cd5\\\\u56fd\\\\u5316\\\\u5b66\\\\u5bb6\\\\u5b89\\\\u5fb7\\\\u70c8...\", \"src_summary\": \"\\\"\\\\u9515\\\\u662f\\\\u4e00\\\\u79cd\\\\u653e\\\\u5c04\\\\u6027\\\\u91d1\\\\...\", \"tgt_summary\": \"\\\"Actinium ist ein radioaktives chemisches Element ...\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "en": {"config_name": "en", "sample_row": "{\"gem_id\": \"\\\"en-train-730484\\\"\", \"gem_parent_id\": \"\\\"en-train-730484\\\"\", \"id\": \"\\\"730484\\\"\", \"src_title\": \"\\\"Astronomy\\\"\", \"tgt_title\": \"null\", \"src_document.title\": \"[\\\"Etymology.\\\", \\\"Use of terms \\\\\\\"astronomy\\\\\\\" and \\\\\\\"a...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"\\\\\\\"Astronomy\\\\\\\" (from the Greek \\\\u1f00\\\\u03c3\\\\u03c4...\", \"src_summary\": \"\\\"Astronomy (from ) is a natural science that studi...\", \"tgt_summary\": \"null\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "fr": {"config_name": "fr", "sample_row": "{\"gem_id\": \"\\\"fr-train-37687\\\"\", \"gem_parent_id\": \"\\\"fr-train-37687\\\"\", \"id\": \"\\\"37687\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"null\", \"src_document.title\": \"[\\\"Histoire.\\\", \\\"N\\\\u00e9olithique.\\\", \\\"Antiquit\\\\u00e9...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"3\\\", \\\"3\\\", \\\"3\\\", \\\"2\\\", \\\"3\\\", \\\"3\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"L'astronomie est consid\\\\u00e9r\\\\u00e9e comme la p...\", \"src_summary\": \"\\\"L\\\\u2019astronomie est la science de l\\\\u2019observ...\", \"tgt_summary\": \"null\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "cs": {"config_name": "cs", "sample_row": "{\"gem_id\": \"\\\"cs-train-909261\\\"\", \"gem_parent_id\": \"\\\"cs-train-909261\\\"\", \"id\": \"\\\"909261\\\"\", \"src_title\": \"\\\"Astronomie\\\"\", \"tgt_title\": \"null\", \"src_document.title\": \"[\\\"Historie astronomie.\\\", \\\"Antika.\\\", \\\"Novov\\\\u011bk....\", \"src_document.section_level\": \"[\\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"\\\", \\\"Astronomie se podobn\\\\u011b jako dal\\\\u0161\\\\u0...\", \"src_summary\": \"\\\"Astronomie, \\\\u0159ecky \\\\u03b1\\\\u03c3\\\\u03c4\\\\u03c1\\\\u...\", \"tgt_summary\": \"null\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "de": {"config_name": "de", "sample_row": "{\"gem_id\": \"\\\"de-train-1599234\\\"\", \"gem_parent_id\": \"\\\"de-train-1599234\\\"\", \"id\": \"\\\"1599234\\\"\", \"src_title\": \"\\\"Ang Lee\\\"\", \"tgt_title\": \"null\", \"src_document.title\": \"[\\\"Leben.\\\", \\\"Filmisches Werk.\\\", \\\"1992\\\\u20131994: Di...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\"]\", \"src_document.content\": \"[\\\"Ang Lee wurde 1954 in Taiwan geboren. Seine Elte...\", \"src_summary\": \"\\\"Ang Lee (; * 23. Oktober 1954 in Chaozhou, Landkr...\", \"tgt_summary\": \"null\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}, "zh": {"config_name": "zh", "sample_row": "{\"gem_id\": \"\\\"zh-train-291204\\\"\", \"gem_parent_id\": \"\\\"zh-train-291204\\\"\", \"id\": \"\\\"291204\\\"\", \"src_title\": \"\\\"\\\\u65c5\\\\u6e38\\\"\", \"tgt_title\": \"null\", \"src_document.title\": \"[\\\"\\\\u65c5\\\\u6e38\\\\u4eba\\\\u58eb.\\\", \\\"\\\\u65c5\\\\u6e38\\\\u7406\\\\...\", \"src_document.section_level\": \"[\\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"1\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"2\\\", \\\"1\\\", \\\"2\\\",...\", \"src_document.content\": \"[\\\"\\\\u65c5\\\\u6e38\\\\u4eba\\\\u58eb\\\\u79f0\\\\u4e3a\\\\u65c5\\\\u5ba2...\", \"src_summary\": \"\\\"\\\\u65c5\\\\u6e38\\\\u5c31\\\\u662f\\\\u65c5\\\\u884c\\\\u6e38\\\\u89c8\\\\...\", \"tgt_summary\": \"null\"}", "columns": ["gem_id", "gem_parent_id", "id", "src_title", "tgt_title", "src_document_title", "src_document_section_level", "src_document_content", "src_summary", "tgt_summary"], "columns_mapping": {"gem_id": "gem_id", "gem_parent_id": "gem_parent_id", "id": "id", "src_title": "src_title", "tgt_title": "tgt_title", "src_document.title": "src_document_title", "src_document.section_level": "src_document_section_level", "src_document.content": "src_document_content", "src_summary": "src_summary", "tgt_summary": "tgt_summary"}, "dataset_description": "The XWikis Corpus (Perez-Beltrachini and Lapata, 2021) provides datasets with different language pairs and directions for cross-lingual abstractive document summarisation. This current version includes four languages: English, German, French, and Czech. The dataset is derived from Wikipedia. It is based on the observation that for a Wikipedia title, the lead section provides an overview conveying salient information, while the body provides detailed information. It thus assumes the body and lead paragraph as a document-summary pair. Furthermore, as a Wikipedia title can be associated with Wikipedia articles in various languages, 1) Wikipedia\u2019s Interlanguage Links are used to find titles across languages and 2) given any two related Wikipedia titles, e.g., Huile d\u2019Olive (French) and Olive Oil (English), the lead paragraph from one title is paired with the body of the other to derive cross-lingual pairs.\n", "dataset_name": "GEM/xwikis"}}, "tags": ["task_categories:summarization", "annotations_creators:found", "multilinguality:unknown", "source_datasets:original", "language:de", "language:en", "language:fr", "language:cs"], "is_gated": false}, "cfilt/iwn_wordlists": {"dataset_name": "cfilt/iwn_wordlists", "description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.", "downloads": 34, "configs": {"assamese": {"config_name": "assamese", "sample_row": "{\"word\": \"\\\"\\\\u09b8\\\\u0982\\\\u099c\\\\u09cd\\\\u099e\\\\u09be\\\\u09b6\\\\u09c2\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "bengali": {"config_name": "bengali", "sample_row": "{\"word\": \"\\\"\\\\u09b8\\\\u09c0\\\\u09ae\\\\u09a8\\\\u09cd\\\\u09a4\\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "bodo": {"config_name": "bodo", "sample_row": "{\"word\": \"\\\"\\\\u0928\\\\u0916\\\\u0930_\\\\u0917\\\\u0948\\\\u092f\\\\u093f\\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "gujarati": {"config_name": "gujarati", "sample_row": "{\"word\": \"\\\"\\\\u0aa6\\\\u0abe\\\\u0ab5\\\\u0abe\\\\u0a97\\\\u0acd\\\\u0aa8\\\\u0abf\\\"...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "hindi": {"config_name": "hindi", "sample_row": "{\"word\": \"\\\"\\\\u0916\\\\u093e\\\\u0938 \\\\u0924\\\\u094c\\\\u0930 \\\\u0938\\\\u094...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "kannada": {"config_name": "kannada", "sample_row": "{\"word\": \"\\\"\\\\u0cae\\\\u0cc3\\\\u0ca4\\\\u0ccd\\\\u0caf\\\\u0c82\\\\u0c9c\\\\u0caf\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "kashmiri": {"config_name": "kashmiri", "sample_row": "{\"word\": \"\\\"\\\\u062c\\\\u0654\\\\u0632\\\\u06cc\\\\u0656\\\\u0631\\\\u064f\\\\u06a9_...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "konkani": {"config_name": "konkani", "sample_row": "{\"word\": \"\\\"\\\\u0939\\\\u093f\\\\u0930\\\\u0923\\\\u094d\\\\u092f\\\\u0915\\\\u0936\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "malayalam": {"config_name": "malayalam", "sample_row": "{\"word\": \"\\\"\\\\u0d05\\\\u0d7c\\\\u0d39\\\\u0d24\\\\u0d15\\\\u0d3f\\\\u0d1f\\\\u0d4d\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "manipuri": {"config_name": "manipuri", "sample_row": "{\"word\": \"\\\"mmL_yAMlb \\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "marathi": {"config_name": "marathi", "sample_row": "{\"word\": \"\\\"\\\\u0932\\\\u093e\\\\u0935\\\\u093e\\\\u0932\\\\u093e\\\\u0935\\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "meitei": {"config_name": "meitei", "sample_row": "{\"word\": \"\\\"\\\\uabc3\\\\uabe6\\\\uabdb\\\\uabc1\\\\uabe4\\\\uabc0\\\\uabe3\\\\uabed\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "nepali": {"config_name": "nepali", "sample_row": "{\"word\": \"\\\"\\\\u0906\\\\u0930\\\\u094b\\\\u092a\\\\u0923\\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "oriya": {"config_name": "oriya", "sample_row": "{\"word\": \"\\\"\\\\u0b2d\\\\u0b42\\\\u0b24\\\\u0b2a\\\\u0b4d\\\\u0b30\\\\u0b47\\\\u0b24\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "punjabi": {"config_name": "punjabi", "sample_row": "{\"word\": \"\\\"\\\\u0a2e\\\\u0a71\\\\u0a16\\\"\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "sanskrit": {"config_name": "sanskrit", "sample_row": "{\"word\": \"\\\"\\\\u092d\\\\u0930\\\\u0926\\\\u094d\\\\u0935\\\\u093e\\\\u091c\\\\u0903\\\"...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "tamil": {"config_name": "tamil", "sample_row": "{\"word\": \"\\\"\\\\u0ba4\\\\u0bc1\\\\u0ba9\\\\u0bcd\\\\u0ba9\\\\u0bc2\\\\u0bb1\\\\u0bc1\\\"...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "telugu": {"config_name": "telugu", "sample_row": "{\"word\": \"\\\"\\\\u0c28\\\\u0c2e\\\\u0c4d\\\\u0c2e\\\\u0c26\\\\u0c17\\\\u0c3f\\\\u0c28\\\"...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}, "urdu": {"config_name": "urdu", "sample_row": "{\"word\": \"\\\"\\\\u0646\\\\u0627\\\\u06af\\\\u06cc\\\\u0634\\\\u0648\\\\u0631\\\\u060c\\\\...\"}", "columns": ["word"], "columns_mapping": {"word": "word"}, "dataset_description": "We provide the unique word list form the IndoWordnet (IWN) knowledge base.\n", "dataset_name": "cfilt/iwn_wordlists"}}, "tags": ["task_categories:token-classification", "annotations_creators:Shivam Mhaskar, Diptesh Kanojia", "multilinguality:monolingual", "source_datasets:original", "language:as", "language:bn", "language:mni", "language:gu", "language:hi", "language:kn", "language:ks", "language:kok", "language:ml", "language:mr", "language:or", "language:ne", "language:pa", "language:sa", "language:ta", "language:te", "language:ur", "abbreviation-detection"], "is_gated": false}, "SocialGrep/the-reddit-place-dataset": {"dataset_name": "SocialGrep/the-reddit-place-dataset", "description": "The written history or /r/Place, in posts and comments.", "downloads": 23, "configs": {"posts": {"config_name": "posts", "sample_row": "{\"type\": \"\\\"post\\\"\", \"id\": \"\\\"twh9v4\\\"\", \"subreddit.id\": \"\\\"2sxhs\\\"\", \"subreddit.name\": \"\\\"place\\\"\", \"subreddit.nsfw\": \"false\", \"created_utc\": \"1649116799\", \"permalink\": \"\\\"https://old.reddit.com/r/place/comments/twh9v4/is...\", \"domain\": \"\\\"i.redd.it\\\"\", \"url\": \"\\\"https://i.redd.it/0kyey4qeplr81.jpg\\\"\", \"selftext\": \"\\\"\\\"\", \"title\": \"\\\"Is this a glitch? What is up with r/place?\\\"\", \"score\": \"8\"}", "columns": ["type", "id", "subreddit_id", "subreddit_name", "subreddit_nsfw", "created_utc", "permalink", "domain", "url", "selftext", "title", "score"], "columns_mapping": {"type": "type", "id": "id", "subreddit.id": "subreddit_id", "subreddit.name": "subreddit_name", "subreddit.nsfw": "subreddit_nsfw", "created_utc": "created_utc", "permalink": "permalink", "domain": "domain", "url": "url", "selftext": "selftext", "title": "title", "score": "score"}, "dataset_description": "The written history or /r/Place, in posts and comments.\n", "dataset_name": "SocialGrep/the-reddit-place-dataset"}, "comments": {"config_name": "comments", "sample_row": "{\"type\": \"1\", \"id\": \"\\\"i3f9n12\\\"\", \"subreddit.id\": \"\\\"2sxhs\\\"\", \"subreddit.name\": \"\\\"place\\\"\", \"subreddit.nsfw\": \"false\", \"created_utc\": \"1649116799\", \"permalink\": \"\\\"https://old.reddit.com/r/place/comments/twdn7y/sp...\", \"body\": \"\\\"[removed]\\\"\", \"sentiment\": \"null\", \"score\": \"1\"}", "columns": ["type", "id", "subreddit_id", "subreddit_name", "subreddit_nsfw", "created_utc", "permalink", "body", "sentiment", "score"], "columns_mapping": {"type": "type", "id": "id", "subreddit.id": "subreddit_id", "subreddit.name": "subreddit_name", "subreddit.nsfw": "subreddit_nsfw", "created_utc": "created_utc", "permalink": "permalink", "body": "body", "sentiment": "sentiment", "score": "score"}, "dataset_description": "The written history or /r/Place, in posts and comments.\n", "dataset_name": "SocialGrep/the-reddit-place-dataset"}}, "tags": ["annotations_creators:lexyr", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "StanBienaives/french-open-fiscal-texts": {"dataset_name": "StanBienaives/french-open-fiscal-texts", "description": " This dataset is an extraction from the OPENDATA/JADE. A list of case laws from the French court \"Conseil d'Etat\".", "downloads": 11, "configs": {"default": {"config_name": "default", "sample_row": "{\"title\": \"\\\"CAA de PARIS, 9\\\\u00e8me chambre, 08/02/2018, 17PA...\", \"content\": \"\\\"Vu la proc\\\\u00e9dure suivante :\\\\n\\\\n Proc\\\\u0...\", \"summary\": \"\\\"\\\"\", \"solution\": \"\\\"\\\"\", \"numero\": \"\\\"17PA01570\\\"\", \"publi_receuil\": \"\\\"C\\\"\", \"date\": \"\\\"2018-02-08\\\"\"}", "columns": ["title", "content", "summary", "solution", "numero", "publi_receuil", "date"], "columns_mapping": {"title": "title", "content": "content", "summary": "summary", "solution": "solution", "numero": "numero", "publi_receuil": "publi_receuil", "date": "date"}, "dataset_description": " This dataset is an extraction from the OPENDATA/JADE. A list of case laws from the French court \"Conseil d'Etat\".\n", "dataset_name": "StanBienaives/french-open-fiscal-texts"}}, "tags": ["task_categories:summarization", "task_categories:feature-extraction", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original"], "is_gated": false}, "McGill-NLP/TopiOCQA": {"dataset_name": "McGill-NLP/TopiOCQA", "description": "TopiOCQA is an information-seeking conversational dataset with challenging topic switching phenomena.", "downloads": 304, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"Conversation_no\": \"1\", \"Turn_no\": \"1\", \"Question\": \"\\\"what was australia's contribution to the battle o...\", \"Answer\": \"\\\"The army personnel and thousands of Australian ai...\", \"Topic\": \"\\\"Australian contribution to the Battle of Normandy...\", \"Topic_section\": \"\\\"Introduction\\\"\", \"Rationale\": \"\\\" The army personnel and thousands of Australian a...\", \"is_nq\": \"false\", \"Context\": \"[]\", \"Additional_answers.Answer\": \"[]\", \"Additional_answers.Topic\": \"[]\", \"Additional_answers.Topic_section\": \"[]\", \"Additional_answers.Rationale\": \"[]\", \"Gold_passage.id\": \"\\\"wiki:5498209\\\"\", \"Gold_passage.title\": \"\\\"Australian contribution to the Battle of Normandy...\", \"Gold_passage.text\": \"\\\"Australian personnel also took part in the invasi...\"}", "columns": ["Conversation_no", "Turn_no", "Question", "Answer", "Topic", "Topic_section", "Rationale", "is_nq", "Context", "Additional_answers_Answer", "Additional_answers_Topic", "Additional_answers_Topic_section", "Additional_answers_Rationale", "Gold_passage_id", "Gold_passage_title", "Gold_passage_text"], "columns_mapping": {"Conversation_no": "Conversation_no", "Turn_no": "Turn_no", "Question": "Question", "Answer": "Answer", "Topic": "Topic", "Topic_section": "Topic_section", "Rationale": "Rationale", "is_nq": "is_nq", "Context": "Context", "Additional_answers.Answer": "Additional_answers_Answer", "Additional_answers.Topic": "Additional_answers_Topic", "Additional_answers.Topic_section": "Additional_answers_Topic_section", "Additional_answers.Rationale": "Additional_answers_Rationale", "Gold_passage.id": "Gold_passage_id", "Gold_passage.title": "Gold_passage_title", "Gold_passage.text": "Gold_passage_text"}, "dataset_description": "TopiOCQA is an information-seeking conversational dataset with challenging topic switching phenomena.\n", "dataset_name": "McGill-NLP/TopiOCQA"}}, "tags": ["task_categories:text-retrieval", "task_categories:text-generation", "task_ids:language-modeling", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en", "conversational-question-answering"], "is_gated": false}, "taln-ls2n/inspec": {"dataset_name": "taln-ls2n/inspec", "description": "Inspec benchmark dataset for keyphrase extraction an generation.", "downloads": 71, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"id\": \"\\\"761\\\"\", \"title\": \"\\\"Towards a NMR implementation of a quantum lattice...\", \"abstract\": \"\\\"Recent theoretical results suggest that an array ...\", \"keyphrases\": \"[\\\"NMR implementation\\\", \\\"quantum lattice gas algori...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"M\\\"]\"}", "columns": ["id", "title", "abstract", "keyphrases", "prmu"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "keyphrases": "keyphrases", "prmu": "prmu"}, "dataset_description": "Inspec benchmark dataset for keyphrase extraction an generation.\n", "dataset_name": "taln-ls2n/inspec"}}, "tags": ["task_categories:text-generation", "annotations_creators:unknown", "multilinguality:monolingual", "language:en"], "is_gated": false}, "taln-ls2n/kp20k": {"dataset_name": "taln-ls2n/kp20k", "description": "KP20k dataset for keyphrase extraction and generation in scientific paper.", "downloads": 32, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"id\": \"\\\"vXFe8Vy\\\"\", \"title\": \"\\\"virtually enhancing the perception of user action...\", \"abstract\": \"\\\"This paper proposes using virtual reality to enha...\", \"keyphrases\": \"[\\\"animation\\\", \\\"avatars\\\", \\\"telepresence\\\", \\\"applicat...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"R\\\", \\\"M\\\"]\"}", "columns": ["id", "title", "abstract", "keyphrases", "prmu"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "keyphrases": "keyphrases", "prmu": "prmu"}, "dataset_description": "KP20k dataset for keyphrase extraction and generation in scientific paper.\n", "dataset_name": "taln-ls2n/kp20k"}}, "tags": ["task_categories:text-generation", "annotations_creators:unknown", "multilinguality:monolingual", "language:en", "keyphrase-generation", "keyphrase-extraction", "text-mining"], "is_gated": false}, "conceptual_captions": {"dataset_name": "conceptual_captions", "description": "Google's Conceptual Captions dataset has more than 3 million images, paired with natural-language captions.\nIn contrast with the curated style of the MS-COCO images, Conceptual Captions images and their raw descriptions are harvested from the web,\nand therefore represent a wider variety of styles. The raw descriptions are harvested from the Alt-text HTML attribute associated with web images.\nThe authors developed an automatic pipeline that extracts, filters, and transforms candidate image/caption pairs, with the goal of achieving a balance of cleanliness,\ninformativeness, fluency, and learnability of the resulting captions.", "downloads": 2447, "configs": {"unlabeled": {"config_name": "unlabeled", "sample_row": "{\"image_url\": \"\\\"http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/AAA...\", \"caption\": \"\\\"a very typical bus station\\\"\"}", "columns": ["image_url", "caption"], "columns_mapping": {"image_url": "image_url", "caption": "caption"}, "dataset_description": "Google's Conceptual Captions dataset has more than 3 million images, paired with natural-language captions.\nIn contrast with the curated style of the MS-COCO images, Conceptual Captions images and their raw descriptions are harvested from the web,\nand therefore represent a wider variety of styles. The raw descriptions are harvested from the Alt-text HTML attribute associated with web images.\nThe authors developed an automatic pipeline that extracts, filters, and transforms candidate image/caption pairs, with the goal of achieving a balance of cleanliness,\ninformativeness, fluency, and learnability of the resulting captions.\n", "dataset_name": "conceptual_captions"}, "labeled": {"config_name": "labeled", "sample_row": "{\"image_url\": \"\\\"https://thumb1.shutterstock.com/display_pic_with_...\", \"caption\": \"\\\"christmas tree on a black background .\\\"\", \"labels\": \"[\\\"christmas tree\\\", \\\"christmas decoration\\\", \\\"font\\\",...\", \"MIDs\": \"[\\\"/m/025nd\\\", \\\"/m/05fc9mj\\\", \\\"/m/03gq5hm\\\", \\\"/m/07s6n...\", \"confidence_scores\": \"[0.9818305373191833, 0.952756941318512, 0.92273795...\"}", "columns": ["image_url", "caption", "labels", "MIDs", "confidence_scores"], "columns_mapping": {"image_url": "image_url", "caption": "caption", "labels": "labels", "MIDs": "MIDs", "confidence_scores": "confidence_scores"}, "dataset_description": "Google's Conceptual Captions dataset has more than 3 million images, paired with natural-language captions.\nIn contrast with the curated style of the MS-COCO images, Conceptual Captions images and their raw descriptions are harvested from the web,\nand therefore represent a wider variety of styles. The raw descriptions are harvested from the Alt-text HTML attribute associated with web images.\nThe authors developed an automatic pipeline that extracts, filters, and transforms candidate image/caption pairs, with the goal of achieving a balance of cleanliness,\ninformativeness, fluency, and learnability of the resulting captions.\n", "dataset_name": "conceptual_captions"}}, "tags": ["task_categories:image-to-text", "task_ids:image-captioning", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "conceptual_12m": {"dataset_name": "conceptual_12m", "description": "Conceptual 12M is a large-scale dataset of 12 million\nimage-text pairs specifically meant to be used for visionand-language pre-training.\nIts data collection pipeline is a relaxed version of the one used in Conceptual Captions 3M.", "downloads": 134, "configs": {"default": {"config_name": "default", "sample_row": "{\"image_url\": \"\\\"https://chairish-prod.freetls.fastly.net/image/pr...\", \"caption\": \"\\\"Metal Design Within Reach Ivory Slipper Chairs - ...\"}", "columns": ["image_url", "caption"], "columns_mapping": {"image_url": "image_url", "caption": "caption"}, "dataset_description": "Conceptual 12M is a large-scale dataset of 12 million\nimage-text pairs specifically meant to be used for visionand-language pre-training.\nIts data collection pipeline is a relaxed version of the one used in Conceptual Captions 3M.\n", "dataset_name": "conceptual_12m"}}, "tags": ["task_categories:image-to-text", "task_ids:image-captioning", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "surrey-nlp/PLOD-filtered": {"dataset_name": "surrey-nlp/PLOD-filtered", "description": "This is the dataset repository for PLOD Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Abbreviation Detection.", "downloads": 44, "configs": {"PLODfiltered": {"config_name": "PLODfiltered", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Alternatively\\\", \\\",\\\", \\\"fibroblasts\\\", \\\"were\\\", \\\"pla...\", \"pos_tags\": \"[2, 13, 8, 3, 16, 2, 14, 14, 11, 3, 10, 16, 6, 0, ...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "pos_tags", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "ner_tags": "ner_tags"}, "dataset_description": "\nThis is the dataset repository for PLOD Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Abbreviation Detection.\n", "dataset_name": "surrey-nlp/PLOD-filtered"}}, "tags": ["task_categories:token-classification", "annotations_creators:Leonardo Zilio, Hadeel Saadany, Prashant Sharma, Diptesh Kanojia, Constantin Orasan", "multilinguality:monolingual", "source_datasets:original", "language:en", "abbreviation-detection"], "is_gated": false}, "Divyanshu/indicxnli": {"dataset_name": "Divyanshu/indicxnli", "description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).", "downloads": 682, "configs": {"hi": {"config_name": "hi", "sample_row": "{\"premise\": \"\\\"\\\\u0905\\\\u0935\\\\u0927\\\\u093e\\\\u0930\\\\u0923\\\\u093e\\\\u0924\\\\...\", \"hypothesis\": \"\\\"\\\\u0909\\\\u0924\\\\u094d\\\\u092a\\\\u093e\\\\u0926 \\\\u0914\\\\u0930...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "bn": {"config_name": "bn", "sample_row": "{\"premise\": \"\\\"\\\\u09a7\\\\u09be\\\\u09b0\\\\u09a3\\\\u09be\\\\u0997\\\\u09a4\\\\u09ad\\\\...\", \"hypothesis\": \"\\\"\\\\u09aa\\\\u09a3\\\\u09cd\\\\u09af \\\\u098f\\\\u09ac\\\\u0982 \\\\u09a...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "mr": {"config_name": "mr", "sample_row": "{\"premise\": \"\\\"\\\\u0938\\\\u0902\\\\u0915\\\\u0932\\\\u094d\\\\u092a\\\\u0928\\\\u093e\\\\...\", \"hypothesis\": \"\\\"\\\\u0909\\\\u0924\\\\u094d\\\\u092a\\\\u093e\\\\u0926\\\\u0928 \\\\u0906...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "as": {"config_name": "as", "sample_row": "{\"premise\": \"\\\"\\\\u09ad\\\\u09be\\\\u09f1\\\\u09bf\\\\u0995\\\\u09ad\\\\u09be\\\\u09ac\\\\...\", \"hypothesis\": \"\\\"\\\\u0989\\\\u09ce\\\\u09aa\\\\u09be\\\\u09a6\\\\u09bf\\\\u09a4 \\\\u09ac...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "ta": {"config_name": "ta", "sample_row": "{\"premise\": \"\\\"\\\\u0b95\\\\u0bb0\\\\u0bc1\\\\u0ba4\\\\u0bcd\\\\u0ba4\\\\u0bbf\\\\u0baf\\\\...\", \"hypothesis\": \"\\\"\\\\u0ba4\\\\u0baf\\\\u0bbe\\\\u0bb0\\\\u0bbf\\\\u0baa\\\\u0bcd\\\\u0baa\\\\...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "te": {"config_name": "te", "sample_row": "{\"premise\": \"\\\"\\\\u0c15\\\\u0c4d\\\\u0c30\\\\u0c40\\\\u0c2e\\\\u0c4d \\\\u0c38\\\\u0c4d...\", \"hypothesis\": \"\\\"\\\\u0c09\\\\u0c24\\\\u0c4d\\\\u0c2a\\\\u0c24\\\\u0c4d\\\\u0c24\\\\u0c3f ...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "or": {"config_name": "or", "sample_row": "{\"premise\": \"\\\"\\\\u0b15\\\\u0b4d\\\\u0b30\\\\u0b3f\\\\u0b2e \\\\u0b38\\\\u0b4d\\\\u0b15...\", \"hypothesis\": \"\\\"\\\\u0b09\\\\u0b24\\\\u0b4d\\\\u0b2a\\\\u0b3e\\\\u0b26 \\\\u0b0f\\\\u0b2c...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "ml": {"config_name": "ml", "sample_row": "{\"premise\": \"\\\"\\\\u0d06\\\\u0d36\\\\u0d2f\\\\u0d2a\\\\u0d30\\\\u0d2e\\\\u0d3e\\\\u0d2f\\\\...\", \"hypothesis\": \"\\\"\\\\u0d09\\\\u0d7d\\\\u0d2a\\\\u0d4d\\\\u0d2a\\\\u0d28\\\\u0d4d\\\\u0d28\\\\...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "pa": {"config_name": "pa", "sample_row": "{\"premise\": \"\\\"\\\\u0a38\\\\u0a3f\\\\u0a27\\\\u0a3e\\\\u0a02\\\\u0a24\\\\u0a15 \\\\u0a24...\", \"hypothesis\": \"\\\"\\\\u0a09\\\\u0a24\\\\u0a2a\\\\u0a3e\\\\u0a26 \\\\u0a05\\\\u0a24\\\\u0a47...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "gu": {"config_name": "gu", "sample_row": "{\"premise\": \"\\\"\\\\u0ab5\\\\u0abf\\\\u0aad\\\\u0abe\\\\u0ab5\\\\u0aa8\\\\u0abe\\\\u0aa4\\\\...\", \"hypothesis\": \"\\\"\\\\u0a89\\\\u0aa4\\\\u0acd\\\\u0aaa\\\\u0abe\\\\u0aa6\\\\u0aa8 \\\\u0a85...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}, "kn": {"config_name": "kn", "sample_row": "{\"premise\": \"\\\"\\\\u0caa\\\\u0cb0\\\\u0cbf\\\\u0c95\\\\u0cb2\\\\u0ccd\\\\u0caa\\\\u0ca8\\\\...\", \"hypothesis\": \"\\\"\\\\u0c89\\\\u0ca4\\\\u0ccd\\\\u0caa\\\\u0ca8\\\\u0ccd\\\\u0ca8 \\\\u0cae...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "IndicXNLI is a translated version of XNLI to 11 Indic Languages. As with XNLI, the goal is\nto predict textual entailment (does sentence A imply/contradict/neither sentence\nB) and is a classification task (given two sentences, predict one of three\nlabels).\n", "dataset_name": "Divyanshu/indicxnli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:machine-generated", "multilinguality:multilingual", "source_datasets:original", "language:as", "language:bn", "language:gu", "language:hi", "language:kn", "language:ml", "language:mr", "language:or", "language:pa", "language:ta", "language:te"], "is_gated": false}, "Yaxin/SemEval2016Task5Raw": {"dataset_name": "Yaxin/SemEval2016Task5Raw", "description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.", "downloads": 64, "configs": {"All": {"config_name": "All", "sample_row": "{\"text\": \"\\\"Judging from previous posts this used to be a goo...\", \"opinions\": \"[{\\\"target\\\": \\\"place\\\", \\\"category\\\": \\\"RESTAURANT#GENER...\", \"language\": \"\\\"english\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"1004293\\\"\", \"sentenceId\": \"\\\"1004293:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_english": {"config_name": "restaurants_english", "sample_row": "{\"text\": \"\\\"Judging from previous posts this used to be a goo...\", \"opinions\": \"[{\\\"target\\\": \\\"place\\\", \\\"category\\\": \\\"RESTAURANT#GENER...\", \"language\": \"\\\"english\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"1004293\\\"\", \"sentenceId\": \"\\\"1004293:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_french": {"config_name": "restaurants_french", "sample_row": "{\"text\": \"\\\"Un service passable .. Des plats surcuits, des sa...\", \"opinions\": \"[{\\\"target\\\": \\\"service\\\", \\\"category\\\": \\\"SERVICE#GENERA...\", \"language\": \"\\\"french\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"g1079435-d3498474-r271346275\\\"\", \"sentenceId\": \"\\\"g1079435-d3498474-r271346275:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_spanish": {"config_name": "restaurants_spanish", "sample_row": "{\"text\": \"\\\"Nos sentimos muy a gusto.\\\"\", \"opinions\": \"[{\\\"target\\\": \\\"NULL\\\", \\\"category\\\": \\\"RESTAURANT#GENERA...\", \"language\": \"\\\"spanish\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"es_9reinas_10_JordiCollGranell_2014-09-21\\\"\", \"sentenceId\": \"\\\"es_9reinas_10_JordiCollGranell_2014-09-21:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_russian": {"config_name": "restaurants_russian", "sample_row": "{\"text\": \"\\\"\\\\u0414\\\\u043e\\\\u0431\\\\u0440\\\\u044b\\\\u0439 \\\\u0447\\\\u0430...\", \"opinions\": \"[]\", \"language\": \"\\\"russian\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"27925\\\"\", \"sentenceId\": \"\\\"27925:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_dutch": {"config_name": "restaurants_dutch", "sample_row": "{\"text\": \"\\\"Lange wachttijd.\\\"\", \"opinions\": \"[{\\\"target\\\": \\\"wachttijd\\\", \\\"category\\\": \\\"SERVICE#GENE...\", \"language\": \"\\\"dutch\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"Review-g1006565-d2066794_1\\\"\", \"sentenceId\": \"\\\"Review-g1006565-d2066794_1:1\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "restaurants_turkish": {"config_name": "restaurants_turkish", "sample_row": "{\"text\": \"\\\"Manzara sahane evet ama servis rezalet.\\\"\", \"opinions\": \"[{\\\"target\\\": \\\"servis\\\", \\\"category\\\": \\\"SERVICE#GENERAL...\", \"language\": \"\\\"turkish\\\"\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"1000\\\"\", \"sentenceId\": \"\\\"1000:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "hotels_arabic": {"config_name": "hotels_arabic", "sample_row": "{\"text\": \"\\\"\\\\u0623\\\\u0646\\\\u0635\\\\u062d \\\\u0628\\\\u0627\\\\u0644\\\\u0646...\", \"opinions\": \"[{\\\"target\\\": \\\"\\\\u0645\\\\u0648\\\\u0642\\\\u0639\\\", \\\"category\\\"...\", \"language\": \"\\\"arabic\\\"\", \"domain\": \"\\\"hotels\\\"\", \"reviewId\": \"\\\"456\\\"\", \"sentenceId\": \"\\\"456:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "mobilephones_dutch": {"config_name": "mobilephones_dutch", "sample_row": "{\"text\": \"\\\"Ik zou deze gsm ten sterkste aanbevelen ik was la...\", \"opinions\": \"[{\\\"target\\\": \\\"\\\", \\\"category\\\": \\\"BATTERY#OPERATION_PER...\", \"language\": \"\\\"dutch\\\"\", \"domain\": \"\\\"mobilephones\\\"\", \"reviewId\": \"\\\"Huawei_Ascend_G6_4G_4\\\"\", \"sentenceId\": \"\\\"Huawei_Ascend_G6_4G_4:1\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "mobilephones_chinese": {"config_name": "mobilephones_chinese", "sample_row": "{\"text\": \"\\\"\\\\u4eca\\\\u5929\\\\u6709\\\\u5e78\\\\u62ff\\\\u5230\\\\u4e86\\\\u6e2f\\\\...\", \"opinions\": \"[]\", \"language\": \"\\\"chinese\\\"\", \"domain\": \"\\\"mobilephones\\\"\", \"reviewId\": \"\\\"1\\\"\", \"sentenceId\": \"\\\"1:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "laptops_english": {"config_name": "laptops_english", "sample_row": "{\"text\": \"\\\"Being a PC user my whole life....\\\"\", \"opinions\": \"[]\", \"language\": \"\\\"english\\\"\", \"domain\": \"\\\"laptops\\\"\", \"reviewId\": \"\\\"79\\\"\", \"sentenceId\": \"\\\"79:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}, "digitalcameras_chinese": {"config_name": "digitalcameras_chinese", "sample_row": "{\"text\": \"\\\"\\\\u5343\\\\u547c\\\\u4e07\\\\u5524\\\\u59cb\\\\u51fa\\\\u6765\\\\uff0c\\\"...\", \"opinions\": \"[]\", \"language\": \"\\\"chinese\\\"\", \"domain\": \"\\\"digitalcameras\\\"\", \"reviewId\": \"\\\"1\\\"\", \"sentenceId\": \"\\\"1:0\\\"\"}", "columns": ["text", "opinions", "language", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "language": "language", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2016 specifically designed to aid research in multilingual Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2016Task5Raw"}}, "tags": [], "is_gated": false}, "Yaxin/SemEval2015Task12Raw": {"dataset_name": "Yaxin/SemEval2015Task12Raw", "description": "A collection of SemEval2015 specifically designed to aid research in Aspect Based Sentiment Analysis.", "downloads": 14, "configs": {"All": {"config_name": "All", "sample_row": "{\"text\": \"\\\"Judging from previous posts this used to be a goo...\", \"opinions\": \"[{\\\"target\\\": \\\"place\\\", \\\"category\\\": \\\"RESTAURANT#GENER...\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"1004293\\\"\", \"sentenceId\": \"\\\"1004293:0\\\"\"}", "columns": ["text", "opinions", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2015 specifically designed to aid research in Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2015Task12Raw"}, "restaurants": {"config_name": "restaurants", "sample_row": "{\"text\": \"\\\"Judging from previous posts this used to be a goo...\", \"opinions\": \"[{\\\"target\\\": \\\"place\\\", \\\"category\\\": \\\"RESTAURANT#GENER...\", \"domain\": \"\\\"restaurants\\\"\", \"reviewId\": \"\\\"1004293\\\"\", \"sentenceId\": \"\\\"1004293:0\\\"\"}", "columns": ["text", "opinions", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2015 specifically designed to aid research in Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2015Task12Raw"}, "laptops": {"config_name": "laptops", "sample_row": "{\"text\": \"\\\"Being a PC user my whole life....\\\"\", \"opinions\": \"[]\", \"domain\": \"\\\"laptops\\\"\", \"reviewId\": \"\\\"79\\\"\", \"sentenceId\": \"\\\"79:0\\\"\"}", "columns": ["text", "opinions", "domain", "reviewId", "sentenceId"], "columns_mapping": {"text": "text", "opinions": "opinions", "domain": "domain", "reviewId": "reviewId", "sentenceId": "sentenceId"}, "dataset_description": "A collection of SemEval2015 specifically designed to aid research in Aspect Based Sentiment Analysis.\n", "dataset_name": "Yaxin/SemEval2015Task12Raw"}}, "tags": [], "is_gated": false}, "cfilt/HiNER-collapsed": {"dataset_name": "cfilt/HiNER-collapsed", "description": "This is the repository for HiNER - a large Hindi Named Entity Recognition dataset.", "downloads": 61, "configs": {"HiNER-Collapsed": {"config_name": "HiNER-Collapsed", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u0907\\\\u0938\\\", \\\"\\\\u0915\\\\u093c\\\\u093e\\\\u0928\\\\u0942\\\\u...\", \"ner_tags\": \"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nThis is the repository for HiNER - a large Hindi Named Entity Recognition dataset.\n", "dataset_name": "cfilt/HiNER-collapsed"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:hi"], "is_gated": false}, "taln-ls2n/semeval-2010-pre": {"dataset_name": "taln-ls2n/semeval-2010-pre", "description": "Preprocessed SemEval-2010 Benchmark dataset for Keyphrase Generation.", "downloads": 38, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"id\": \"\\\"J-39\\\"\", \"title\": \"\\\"The Sequential Auction Problem on eBay: An Empiri...\", \"abstract\": \"\\\"Bidders on eBay have no dominant bidding strategy...\", \"keyphrases\": \"[\\\"sequenti auction problem\\\", \\\"empir analysi\\\", \\\"bid...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"U\\\", \\\"M\\\",...\", \"lvl-1\": \"\\\"The Sequential Auction Problem on eBay: An Empiri...\", \"lvl-2\": \"\\\"The Sequential Auction Problem on eBay: An Empiri...\", \"lvl-3\": \"\\\"The Sequential Auction Problem on eBay: An Empiri...\", \"lvl-4\": \"\\\"The Sequential Auction Problem on eBay: An Empiri...\"}", "columns": ["id", "title", "abstract", "keyphrases", "prmu", "lvl-1", "lvl-2", "lvl-3", "lvl-4"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "keyphrases": "keyphrases", "prmu": "prmu", "lvl-1": "lvl-1", "lvl-2": "lvl-2", "lvl-3": "lvl-3", "lvl-4": "lvl-4"}, "dataset_description": "Preprocessed SemEval-2010 Benchmark dataset for Keyphrase Generation.\n", "dataset_name": "taln-ls2n/semeval-2010-pre"}}, "tags": ["task_categories:text-generation", "annotations_creators:unknown", "multilinguality:monolingual", "language:en"], "is_gated": false}, "McGill-NLP/FaithDial": {"dataset_name": "McGill-NLP/FaithDial", "description": "FaithDial is a new benchmark for hallucination-free dialogues, created by manually editing hallucinated and uncooperative responses in Wizard of Wikipedia.", "downloads": 929, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"dialog_idx\": \"0\", \"response\": \"\\\"Yeah, but once the access to the internet was a r...\", \"original_response\": \"\\\"No I could not! I couldn't imagine living when in...\", \"history\": \"[\\\"Can you imagine the world without internet acces...\", \"knowledge\": \"\\\"Internet access was once rare, but has grown rapi...\", \"BEGIN\": \"[\\\"Hallucination\\\"]\", \"VRM\": \"[\\\"Disclosure\\\", \\\"Ack.\\\"]\"}", "columns": ["dialog_idx", "response", "original_response", "history", "knowledge", "BEGIN", "VRM"], "columns_mapping": {"dialog_idx": "dialog_idx", "response": "response", "original_response": "original_response", "history": "history", "knowledge": "knowledge", "BEGIN": "BEGIN", "VRM": "VRM"}, "dataset_description": "FaithDial is a new benchmark for hallucination-free dialogues, created by manually editing hallucinated and uncooperative responses in Wizard of Wikipedia.\n", "dataset_name": "McGill-NLP/FaithDial"}}, "tags": ["task_categories:conversational", "task_categories:text-generation", "task_ids:dialogue-modeling", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en", "faithful-dialogue-modeling", "trustworthy-dialogue-modeling"], "is_gated": false}, "cfilt/HiNER-original": {"dataset_name": "cfilt/HiNER-original", "description": "This is the dataset repository for HiNER Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Named Entity Recognitin for the Hindi language.", "downloads": 266, "configs": {"HiNER": {"config_name": "HiNER", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u0907\\\\u0938\\\", \\\"\\\\u0915\\\\u093c\\\\u093e\\\\u0928\\\\u0942\\\\u...\", \"ner_tags\": \"[22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 2...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "\nThis is the dataset repository for HiNER Dataset accepted to be published at LREC 2022.\nThe dataset can help build sequence labelling models for the task Named Entity Recognitin for the Hindi language.\n", "dataset_name": "cfilt/HiNER-original"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:hi"], "is_gated": false}, "AmazonScience/massive": {"dataset_name": "AmazonScience/massive", "description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.", "downloads": 9770, "configs": {"af-ZA": {"config_name": "af-ZA", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"af-ZA\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"maak my wakker nege-uur v. m. op vrydag\\\"\", \"annot_utt\": \"\\\"maak my wakker [time : nege-uur v. m.] op [date :...\", \"worker_id\": \"\\\"20\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"40\\\", \\\"49\\\", \\\"20\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "am-ET": {"config_name": "am-ET", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"am-ET\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u12a0\\\\u122d\\\\u1265 \\\\u12d8\\\\u1320\\\\u129d \\\\u12a4. \\\\u1...\", \"annot_utt\": \"\\\"[date : \\\\u12a0\\\\u122d\\\\u1265] [time : \\\\u12d8\\\\u1320\\\\...\", \"worker_id\": \"\\\"18\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"44\\\", \\\"20\\\", \\\"55\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target|english\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ar-SA": {"config_name": "ar-SA", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ar-SA\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0635\\\\u062d\\\\u064a\\\\u0646\\\\u064a \\\\u062a\\\\u0633\\\\u0639...\", \"annot_utt\": \"\\\"\\\\u0635\\\\u062d\\\\u064a\\\\u0646\\\\u064a [time : \\\\u062a\\\\u06...\", \"worker_id\": \"\\\"31\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"31\\\", \\\"19\\\", \\\"20\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "az-AZ": {"config_name": "az-AZ", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"az-AZ\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"c\\\\u00fcm\\\\u0259 g\\\\u00fcn\\\\u00fc s\\\\u0259h\\\\u0259r saa...\", \"annot_utt\": \"\\\"[date : c\\\\u00fcm\\\\u0259 g\\\\u00fcn\\\\u00fc] [time : s\\\\...\", \"worker_id\": \"\\\"14\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"14\\\", \\\"29\\\", \\\"7\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "bn-BD": {"config_name": "bn-BD", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"bn-BD\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0986\\\\u09ae\\\\u09be\\\\u0995\\\\u09c7 \\\\u09b6\\\\u09c1\\\\u0995...\", \"annot_utt\": \"\\\"\\\\u0986\\\\u09ae\\\\u09be\\\\u0995\\\\u09c7 [date : \\\\u09b6\\\\u09...\", \"worker_id\": \"\\\"19\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"0\\\", \\\"12\\\", \\\"1\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 1]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ca-ES": {"config_name": "ca-ES", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ca-ES\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"desperta'm a les nou a. m. del divendres\\\"\", \"annot_utt\": \"\\\"desperta'm a les [time : nou a. m.] del [date : d...\", \"worker_id\": \"\\\"42\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"42\\\", \\\"30\\\", \\\"3\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target|english\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "cy-GB": {"config_name": "cy-GB", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"cy-GB\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"deffra fi am naw y bore ar dydd gwener\\\"\", \"annot_utt\": \"\\\"deffra fi am [time : naw y bore] ar [date : dydd ...\", \"worker_id\": \"\\\"8\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"4\\\", \\\"1\\\", \\\"5\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[1, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "da-DK": {"config_name": "da-DK", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"da-DK\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"v\\\\u00e6k mig klokken ni fredag\\\"\", \"annot_utt\": \"\\\"v\\\\u00e6k mig klokken [time : ni] [date : fredag]\\\"...\", \"worker_id\": \"\\\"6\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"19\\\", \\\"6\\\", \\\"17\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "de-DE": {"config_name": "de-DE", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"de-DE\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"weck mich am freitag um neun uhr auf\\\"\", \"annot_utt\": \"\\\"weck mich am [date : freitag] um [time : neun uhr...\", \"worker_id\": \"\\\"18\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"28\\\", \\\"8\\\", \\\"18\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "el-GR": {"config_name": "el-GR", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"el-GR\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u03be\\\\u03cd\\\\u03c0\\\\u03bd\\\\u03b1 \\\\u03bc\\\\u03b5 \\\\u03c...\", \"annot_utt\": \"\\\"\\\\u03be\\\\u03cd\\\\u03c0\\\\u03bd\\\\u03b1 \\\\u03bc\\\\u03b5 \\\\u03c...\", \"worker_id\": \"\\\"30\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"28\\\", \\\"68\\\", \\\"23\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "en-US": {"config_name": "en-US", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"en-US\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"wake me up at nine am on friday\\\"\", \"annot_utt\": \"\\\"wake me up at [time : nine am] on [date : friday]...\", \"worker_id\": \"\\\"1\\\"\", \"slot_method.slot\": \"[]\", \"slot_method.method\": \"[]\", \"judgments.worker_id\": \"[]\", \"judgments.intent_score\": \"[]\", \"judgments.slots_score\": \"[]\", \"judgments.grammar_score\": \"[]\", \"judgments.spelling_score\": \"[]\", \"judgments.language_identification\": \"[]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "es-ES": {"config_name": "es-ES", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"es-ES\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"despi\\\\u00e9rtame a las nueve de la ma\\\\u00f1ana el...\", \"annot_utt\": \"\\\"despi\\\\u00e9rtame a las [time : nueve de la ma\\\\u00...\", \"worker_id\": \"\\\"5\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"21\\\", \\\"5\\\", \\\"3\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "fa-IR": {"config_name": "fa-IR", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"fa-IR\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0645\\\\u0631\\\\u0627 \\\\u062c\\\\u0645\\\\u0639\\\\u0647 \\\\u063...\", \"annot_utt\": \"\\\"\\\\u0645\\\\u0631\\\\u0627 [date : \\\\u062c\\\\u0645\\\\u0639\\\\u06...\", \"worker_id\": \"\\\"3\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"3\\\", \\\"8\\\", \\\"14\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "fi-FI": {"config_name": "fi-FI", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"fi-FI\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"her\\\\u00e4t\\\\u00e4 minut aamuyhdeks\\\\u00e4lt\\\\u00e4 p...\", \"annot_utt\": \"\\\"her\\\\u00e4t\\\\u00e4 minut [time : aamuyhdeks\\\\u00e4lt...\", \"worker_id\": \"\\\"17\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"17\\\", \\\"6\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "fr-FR": {"config_name": "fr-FR", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"fr-FR\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"r\\\\u00e9veille-moi \\\\u00e0 neuf heures du matin le ...\", \"annot_utt\": \"\\\"r\\\\u00e9veille-moi \\\\u00e0 [time : neuf heures du m...\", \"worker_id\": \"\\\"22\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"11\\\", \\\"22\\\", \\\"0\\\"]\", \"judgments.intent_score\": \"[2, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "he-IL": {"config_name": "he-IL", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"he-IL\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u05db\\\\u05d5\\\\u05d5\\\\u05df \\\\u05d0\\\\u05ea \\\\u05d4\\\\u05e...\", \"annot_utt\": \"\\\"\\\\u05db\\\\u05d5\\\\u05d5\\\\u05df \\\\u05d0\\\\u05ea \\\\u05d4\\\\u05e...\", \"worker_id\": \"\\\"29\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"16\\\", \\\"3\\\", \\\"0\\\"]\", \"judgments.intent_score\": \"[2, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "hi-IN": {"config_name": "hi-IN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"hi-IN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0936\\\\u0941\\\\u0915\\\\u094d\\\\u0930\\\\u0935\\\\u093e\\\\u0930 ...\", \"annot_utt\": \"\\\"[date : \\\\u0936\\\\u0941\\\\u0915\\\\u094d\\\\u0930\\\\u0935\\\\u093...\", \"worker_id\": \"\\\"45\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"16\\\", \\\"3\\\", \\\"42\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 3]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "hu-HU": {"config_name": "hu-HU", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"hu-HU\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u00e9bressz fel reggel kilenckor p\\\\u00e9nteken\\\"...\", \"annot_utt\": \"\\\"\\\\u00e9bressz fel [time : reggel kilenckor] [date ...\", \"worker_id\": \"\\\"12\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"12\\\", \\\"28\\\", \\\"31\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "hy-AM": {"config_name": "hy-AM", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"hy-AM\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0561\\\\u0580\\\\u0569\\\\u0576\\\\u0561\\\\u0581\\\\u0580\\\\u0578\\\\...\", \"annot_utt\": \"\\\"\\\\u0561\\\\u0580\\\\u0569\\\\u0576\\\\u0561\\\\u0581\\\\u0580\\\\u0578\\\\...\", \"worker_id\": \"\\\"39\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"37\\\", \\\"11\\\", \\\"6\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 1]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "id-ID": {"config_name": "id-ID", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"id-ID\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"bagunkan saya jam sembilan pagi hari jumat\\\"\", \"annot_utt\": \"\\\"bagunkan saya [time : jam sembilan pagi] hari [da...\", \"worker_id\": \"\\\"21\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"7\\\", \\\"15\\\", \\\"9\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "is-IS": {"config_name": "is-IS", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"is-IS\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"vekja mig klukkan n\\\\u00edu a\\\\u00f0 morgni \\\\u00e1 ...\", \"annot_utt\": \"\\\"vekja mig klukkan [time : n\\\\u00edu a\\\\u00f0 morgni...\", \"worker_id\": \"\\\"8\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"11\\\", \\\"20\\\", \\\"21\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "it-IT": {"config_name": "it-IT", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"it-IT\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"svegliami alle nove di mattina venerd\\\\u00ec\\\"\", \"annot_utt\": \"\\\"svegliami alle [time : nove] di mattina [date : v...\", \"worker_id\": \"\\\"34\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"40\\\", \\\"18\\\", \\\"6\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ja-JP": {"config_name": "ja-JP", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ja-JP\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u91d1\\\\u66dc\\\\u65e5\\\\u306e\\\\u5348\\\\u524d\\\\u4e5d\\\\u6642\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u91d1\\\\u66dc\\\\u65e5] \\\\u306e [time : \\\\u5348...\", \"worker_id\": \"\\\"3\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"8\\\", \\\"5\\\", \\\"16\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "jv-ID": {"config_name": "jv-ID", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"jv-ID\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"gugah aku jam sanga esuk dina jumat\\\"\", \"annot_utt\": \"\\\"gugah aku jam [time : sanga esuk] dina [date : ju...\", \"worker_id\": \"\\\"9\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"16\\\", \\\"10\\\", \\\"17\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ka-GE": {"config_name": "ka-GE", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ka-GE\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u10d3\\\\u10d8\\\\u10da\\\\u10d8\\\\u10e1 \\\\u10ea\\\\u10ee\\\\u10e0...\", \"annot_utt\": \"\\\"[time : \\\\u10d3\\\\u10d8\\\\u10da\\\\u10d8\\\\u10e1 \\\\u10ea\\\\u10...\", \"worker_id\": \"\\\"42\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"38\\\", \\\"45\\\", \\\"17\\\"]\", \"judgments.intent_score\": \"[0, 0, 0]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[2, 1, 1]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "km-KH": {"config_name": "km-KH", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"km-KH\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u1798\\\\u17c9\\\\u17c4\\\\u1784 \\\\u1794\\\\u17d2\\\\u179a\\\\u17b6...\", \"annot_utt\": \"\\\"\\\\u1798\\\\u17c9\\\\u17c4\\\\u1784 [time : \\\\u1794\\\\u17d2\\\\u17...\", \"worker_id\": \"\\\"20\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"10\\\", \\\"29\\\", \\\"30\\\"]\", \"judgments.intent_score\": \"[1, 0, 1]\", \"judgments.slots_score\": \"[1, 2, 2]\", \"judgments.grammar_score\": \"[3, 0, 3]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "kn-IN": {"config_name": "kn-IN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"kn-IN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0cb6\\\\u0cc1\\\\u0c95\\\\u0ccd\\\\u0cb0\\\\u0cb5\\\\u0cbe\\\\u0cb0 ...\", \"annot_utt\": \"\\\"[date : \\\\u0cb6\\\\u0cc1\\\\u0c95\\\\u0ccd\\\\u0cb0\\\\u0cb5\\\\u0cb...\", \"worker_id\": \"\\\"7\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"14\\\", \\\"7\\\", \\\"6\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ko-KR": {"config_name": "ko-KR", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ko-KR\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\uae08\\\\uc694\\\\uc77c \\\\uc624\\\\uc804 \\\\uc544\\\\ud649 \\\\uc2...\", \"annot_utt\": \"\\\"[date : \\\\uae08\\\\uc694\\\\uc77c] [time : \\\\uc624\\\\uc804 ...\", \"worker_id\": \"\\\"23\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"23\\\", \\\"2\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "lv-LV": {"config_name": "lv-LV", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"lv-LV\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"piektdien pamodini mani devi\\\\u0146os no r\\\\u012bta...\", \"annot_utt\": \"\\\"[date : piektdien] pamodini mani [time : devi\\\\u01...\", \"worker_id\": \"\\\"23\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"localization\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"22\\\", \\\"9\\\", \\\"15\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ml-IN": {"config_name": "ml-IN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ml-IN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0d35\\\\u0d46\\\\u0d33\\\\u0d4d\\\\u0d33\\\\u0d3f\\\\u0d2f\\\\u0d3e\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u0d35\\\\u0d46\\\\u0d33\\\\u0d4d\\\\u0d33\\\\u0d3f\\\\u0d2...\", \"worker_id\": \"\\\"26\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"26\\\", \\\"23\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "mn-MN": {"config_name": "mn-MN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"mn-MN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u0430\\\\u0434 ...\", \"annot_utt\": \"\\\"[date : \\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u043...\", \"worker_id\": \"\\\"16\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"4\\\", \\\"2\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 3, 4]\", \"judgments.spelling_score\": \"[1, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ms-MY": {"config_name": "ms-MY", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ms-MY\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"kejutkan saya pada pukul sembilan pagi hari jumaa...\", \"annot_utt\": \"\\\"kejutkan saya pada pukul [time : sembilan pagi] h...\", \"worker_id\": \"\\\"12\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"9\\\", \\\"5\\\", \\\"1\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "my-MM": {"config_name": "my-MM", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"my-MM\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u1004\\\\u102b\\\\u1037\\\\u1000\\\\u102d\\\\u102f \\\\u101e\\\\u1031...\", \"annot_utt\": \"\\\"\\\\u1004\\\\u102b\\\\u1037\\\\u1000\\\\u102d\\\\u102f [date : \\\\u10...\", \"worker_id\": \"\\\"33\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"90\\\", \\\"48\\\", \\\"39\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "nb-NO": {"config_name": "nb-NO", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"nb-NO\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"vekk meg ni null null p\\\\u00e5 fredag\\\"\", \"annot_utt\": \"\\\"vekk meg [time : ni null null] p\\\\u00e5 [date : fr...\", \"worker_id\": \"\\\"15\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"10\\\", \\\"19\\\", \\\"11\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "nl-NL": {"config_name": "nl-NL", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"nl-NL\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"maakt mijn wakker om negen uur in de ochtend op v...\", \"annot_utt\": \"\\\"maakt mijn wakker om [time : negen uur in de ocht...\", \"worker_id\": \"\\\"22\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"35\\\", \\\"34\\\", \\\"31\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 3]\", \"judgments.spelling_score\": \"[2, 1, 1]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "pl-PL": {"config_name": "pl-PL", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"pl-PL\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"obud\\\\u017a mnie o dziewi\\\\u0105tej rano w pi\\\\u0105...\", \"annot_utt\": \"\\\"obud\\\\u017a mnie o [time : dziewi\\\\u0105tej rano] w...\", \"worker_id\": \"\\\"9\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"21\\\", \\\"11\\\", \\\"5\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "pt-PT": {"config_name": "pt-PT", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"pt-PT\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"acorda-me \\\\u00e0s nove da manh\\\\u00e3 na sexta-fei...\", \"annot_utt\": \"\\\"acorda-me \\\\u00e0s [time : nove da manh\\\\u00e3] na ...\", \"worker_id\": \"\\\"14\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"localization\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"6\\\", \\\"8\\\", \\\"12\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 2]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[1, 1, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ro-RO": {"config_name": "ro-RO", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ro-RO\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"trezeste-ma vineri la noua dimineata\\\"\", \"annot_utt\": \"\\\"trezeste-ma [date : vineri] la [time : noua dimin...\", \"worker_id\": \"\\\"6\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"3\\\", \\\"63\\\", \\\"10\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ru-RU": {"config_name": "ru-RU", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ru-RU\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0440\\\\u0430\\\\u0437\\\\u0431\\\\u0443\\\\u0434\\\\u0438 \\\\u043c...\", \"annot_utt\": \"\\\"\\\\u0440\\\\u0430\\\\u0437\\\\u0431\\\\u0443\\\\u0434\\\\u0438 \\\\u043c...\", \"worker_id\": \"\\\"11\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"4\\\", \\\"32\\\", \\\"8\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "sl-SL": {"config_name": "sl-SL", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"sl-SL\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"zbudi me ob devetih zjutraj v petek\\\"\", \"annot_utt\": \"\\\"zbudi me ob [time : devetih zjutraj] v [date : pe...\", \"worker_id\": \"\\\"14\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"1\\\", \\\"13\\\", \\\"0\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 4, 3]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "sq-AL": {"config_name": "sq-AL", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"sq-AL\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"m\\\\u00eb zgjo t\\\\u00eb premten n\\\\u00eb n\\\\u00ebnt\\\\u0...\", \"annot_utt\": \"\\\"m\\\\u00eb zgjo [date : t\\\\u00eb premten] n\\\\u00eb [ti...\", \"worker_id\": \"\\\"16\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"localization\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"3\\\", \\\"2\\\", \\\"16\\\"]\", \"judgments.intent_score\": \"[1, 1, 2]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target|english\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "sv-SE": {"config_name": "sv-SE", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"sv-SE\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"v\\\\u00e4ck mig vid nio p\\\\u00e5 fredag\\\"\", \"annot_utt\": \"\\\"v\\\\u00e4ck mig vid [time : nio] p\\\\u00e5 [date : fr...\", \"worker_id\": \"\\\"20\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"11\\\", \\\"20\\\", \\\"0\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "sw-KE": {"config_name": "sw-KE", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"sw-KE\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"niamshe saa tatu asubuhi ijumaa\\\"\", \"annot_utt\": \"\\\"niamshe [time : saa tatu asubuhi] [date : ijumaa]...\", \"worker_id\": \"\\\"59\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"33\\\", \\\"1\\\", \\\"52\\\"]\", \"judgments.intent_score\": \"[1, 1, 2]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ta-IN": {"config_name": "ta-IN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ta-IN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0bb5\\\\u0bc6\\\\u0bb3\\\\u0bcd\\\\u0bb3\\\\u0bbf\\\\u0b95\\\\u0bcd\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u0bb5\\\\u0bc6\\\\u0bb3\\\\u0bcd\\\\u0bb3\\\\u0bbf\\\\u0b9...\", \"worker_id\": \"\\\"23\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"23\\\", \\\"17\\\", \\\"13\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "te-IN": {"config_name": "te-IN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"te-IN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0c36\\\\u0c41\\\\u0c15\\\\u0c4d\\\\u0c30\\\\u0c35\\\\u0c3e\\\\u0c30\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u0c36\\\\u0c41\\\\u0c15\\\\u0c4d\\\\u0c30\\\\u0c35\\\\u0c3...\", \"worker_id\": \"\\\"21\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"2\\\", \\\"15\\\", \\\"1\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 4, 4]\", \"judgments.spelling_score\": \"[1, 2, 1]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "th-TH": {"config_name": "th-TH", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"th-TH\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0e1b\\\\u0e25\\\\u0e38\\\\u0e01\\\\u0e09\\\\u0e31\\\\u0e19 \\\\u0e15...\", \"annot_utt\": \"\\\"\\\\u0e1b\\\\u0e25\\\\u0e38\\\\u0e01\\\\u0e09\\\\u0e31\\\\u0e19 \\\\u0e15...\", \"worker_id\": \"\\\"24\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"24\\\", \\\"35\\\", \\\"0\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "tl-PH": {"config_name": "tl-PH", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"tl-PH\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"gisingin mo ako ng alas nuwebe ng umaga sa biyern...\", \"annot_utt\": \"\\\"gisingin mo ako ng [time : alas nuwebe ng umaga] ...\", \"worker_id\": \"\\\"17\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"19\\\", \\\"6\\\", \\\"1\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "tr-TR": {"config_name": "tr-TR", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"tr-TR\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"beni cuma g\\\\u00fcn\\\\u00fc sabah dokuzda uyand\\\\u013...\", \"annot_utt\": \"\\\"beni [date : cuma] g\\\\u00fcn\\\\u00fc [time : sabah d...\", \"worker_id\": \"\\\"12\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"2\\\", \\\"4\\\", \\\"9\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "ur-PK": {"config_name": "ur-PK", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"ur-PK\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0645\\\\u062c\\\\u06be\\\\u06d2 \\\\u062c\\\\u0645\\\\u0639\\\\u06c1...\", \"annot_utt\": \"\\\"\\\\u0645\\\\u062c\\\\u06be\\\\u06d2 [date : \\\\u062c\\\\u0645\\\\u06...\", \"worker_id\": \"\\\"13\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"9\\\", \\\"13\\\", \\\"10\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 3]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "vi-VN": {"config_name": "vi-VN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"vi-VN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"g\\\\u1ecdi t\\\\u00f4i d\\\\u1eady l\\\\u00fac ch\\\\u00edn gi\\\\...\", \"annot_utt\": \"\\\"g\\\\u1ecdi t\\\\u00f4i d\\\\u1eady l\\\\u00fac [time : ch\\\\u0...\", \"worker_id\": \"\\\"36\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"3\\\", \\\"36\\\", \\\"37\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "zh-CN": {"config_name": "zh-CN", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"zh-CN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u661f\\\\u671f\\\\u4e94\\\\u65e9\\\\u4e0a\\\\u4e5d\\\\u70b9\\\\u53eb\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u661f\\\\u671f\\\\u4e94] \\\\u65e9\\\\u4e0a [time : ...\", \"worker_id\": \"\\\"5\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"36\\\", \\\"4\\\", \\\"12\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 3, 4]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "zh-TW": {"config_name": "zh-TW", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"zh-TW\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u79ae\\\\u62dc\\\\u4e94\\\\u65e9\\\\u4e0a\\\\u4e5d\\\\u9ede\\\\u53eb\\\\...\", \"annot_utt\": \"\\\"[date : \\\\u79ae\\\\u62dc\\\\u4e94] [time : \\\\u65e9\\\\u4e0a\\\\...\", \"worker_id\": \"\\\"49\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"6\\\", \\\"17\\\", \\\"44\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[4, 4, 3]\", \"judgments.spelling_score\": \"[2, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "all": {"config_name": "all", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"mn-MN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u0430\\\\u0434 ...\", \"annot_utt\": \"\\\"[date : \\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u043...\", \"worker_id\": \"\\\"16\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"4\\\", \\\"2\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 3, 4]\", \"judgments.spelling_score\": \"[1, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}, "all_1.1": {"config_name": "all_1.1", "sample_row": "{\"id\": \"\\\"1\\\"\", \"locale\": \"\\\"mn-MN\\\"\", \"partition\": \"\\\"train\\\"\", \"scenario\": \"16\", \"intent\": \"48\", \"utt\": \"\\\"\\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u0430\\\\u0434 ...\", \"annot_utt\": \"\\\"[date : \\\\u0442\\\\u0430\\\\u0432\\\\u0434\\\\u0430\\\\u0445\\\\u043...\", \"worker_id\": \"\\\"16\\\"\", \"slot_method.slot\": \"[\\\"time\\\", \\\"date\\\"]\", \"slot_method.method\": \"[\\\"translation\\\", \\\"translation\\\"]\", \"judgments.worker_id\": \"[\\\"5\\\", \\\"4\\\", \\\"2\\\"]\", \"judgments.intent_score\": \"[1, 1, 1]\", \"judgments.slots_score\": \"[1, 1, 1]\", \"judgments.grammar_score\": \"[3, 3, 4]\", \"judgments.spelling_score\": \"[1, 2, 2]\", \"judgments.language_identification\": \"[\\\"target\\\", \\\"target\\\", \\\"target\\\"]\"}", "columns": ["id", "locale", "partition", "scenario", "intent", "utt", "annot_utt", "worker_id", "slot_method_slot", "slot_method_method", "judgments_worker_id", "judgments_intent_score", "judgments_slots_score", "judgments_grammar_score", "judgments_spelling_score", "judgments_language_identification"], "columns_mapping": {"id": "id", "locale": "locale", "partition": "partition", "scenario": "scenario", "intent": "intent", "utt": "utt", "annot_utt": "annot_utt", "worker_id": "worker_id", "slot_method.slot": "slot_method_slot", "slot_method.method": "slot_method_method", "judgments.worker_id": "judgments_worker_id", "judgments.intent_score": "judgments_intent_score", "judgments.slots_score": "judgments_slots_score", "judgments.grammar_score": "judgments_grammar_score", "judgments.spelling_score": "judgments_spelling_score", "judgments.language_identification": "judgments_language_identification"}, "dataset_description": " MASSIVE is a parallel dataset of > 1M utterances across 51 languages with annotations\n for the Natural Language Understanding tasks of intent prediction and slot annotation.\n Utterances span 60 intents and include 55 slot types. MASSIVE was created by localizing\n the SLURP dataset, composed of general Intelligent Voice Assistant single-shot interactions.\n", "dataset_name": "AmazonScience/massive"}}, "tags": ["task_categories:text-classification", "task_ids:intent-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "multilinguality:af-ZA", "multilinguality:am-ET", "multilinguality:ar-SA", "multilinguality:az-AZ", "multilinguality:bn-BD", "multilinguality:ca-ES", "multilinguality:cy-GB", "multilinguality:da-DK", "multilinguality:de-DE", "multilinguality:el-GR", "multilinguality:en-US", "multilinguality:es-ES", "multilinguality:fa-IR", "multilinguality:fi-FI", "multilinguality:fr-FR", "multilinguality:he-IL", "multilinguality:hi-IN", "multilinguality:hu-HU", "multilinguality:hy-AM", "multilinguality:id-ID", "multilinguality:is-IS", "multilinguality:it-IT", "multilinguality:ja-JP", "multilinguality:jv-ID", "multilinguality:ka-GE", "multilinguality:km-KH", "multilinguality:kn-IN", "multilinguality:ko-KR", "multilinguality:lv-LV", "multilinguality:ml-IN", "multilinguality:mn-MN", "multilinguality:ms-MY", "multilinguality:my-MM", "multilinguality:nb-NO", "multilinguality:nl-NL", "multilinguality:pl-PL", "multilinguality:pt-PT", "multilinguality:ro-RO", "multilinguality:ru-RU", "multilinguality:sl-SL", "multilinguality:sq-AL", "multilinguality:sv-SE", "multilinguality:sw-KE", "multilinguality:ta-IN", "multilinguality:te-IN", "multilinguality:th-TH", "multilinguality:tl-PH", "multilinguality:tr-TR", "multilinguality:ur-PK", "multilinguality:vi-VN", "multilinguality:zh-CN", "multilinguality:zh-TW", "source_datasets:original", "natural-language-understanding"], "is_gated": false}, "aps/dynahate": {"dataset_name": "aps/dynahate", "description": "We present a human-and-model-in-the-loop process for dynamically generating datasets and training better performing and more robust hate detection models. We provide a new dataset of ~40,000 entries, generated and labelled by trained annotators over four rounds of dynamic data creation. It includes ~15,000 challenging perturbations and each hateful entry has fine-grained labels for the type and target of hate. Hateful entries make up 54% of the dataset, which is substantially higher than comparable datasets. We show that model performance is substantially improved using this approach. Models trained on later rounds of data collection perform better on test sets and are harder for annotators to trick. They also perform better on HATECHECK, a suite of functional tests for online hate detection. See https://arxiv.org/abs/2012.15761 for more details.", "downloads": 122, "configs": {"0.2.3": {"config_name": "0.2.3", "sample_row": "{\"acl.id\": \"\\\"acl1\\\"\", \"label\": \"1\", \"text\": \"\\\"dalits are lowlives\\\"\", \"X1\": \"1\", \"type\": \"\\\"notgiven\\\"\", \"target\": \"\\\"notgiven\\\"\", \"level\": \"\\\"original\\\"\", \"split\": \"\\\"train\\\"\", \"round.base\": \"1\", \"annotator\": \"\\\"GNZuCtwed3\\\"\", \"round\": \"\\\"1\\\"\", \"acl.id.matched\": \"\\\"NA\\\"\"}", "columns": ["acl_id", "label", "text", "X1", "type", "target", "level", "split", "round_base", "annotator", "round", "acl_id_matched"], "columns_mapping": {"acl.id": "acl_id", "label": "label", "text": "text", "X1": "X1", "type": "type", "target": "target", "level": "level", "split": "split", "round.base": "round_base", "annotator": "annotator", "round": "round", "acl.id.matched": "acl_id_matched"}, "dataset_description": "We present a human-and-model-in-the-loop process for dynamically generating datasets and training better performing and more robust hate detection models. We provide a new dataset of ~40,000 entries, generated and labelled by trained annotators over four rounds of dynamic data creation. It includes ~15,000 challenging perturbations and each hateful entry has fine-grained labels for the type and target of hate. Hateful entries make up 54% of the dataset, which is substantially higher than comparable datasets. We show that model performance is substantially improved using this approach. Models trained on later rounds of data collection perform better on test sets and are harder for annotators to trick. They also perform better on HATECHECK, a suite of functional tests for online hate detection. See https://arxiv.org/abs/2012.15761 for more details.\n", "dataset_name": "aps/dynahate"}, "0.2.2": {"config_name": "0.2.2", "sample_row": "{\"acl.id\": \"\\\"acl1\\\"\", \"label\": \"1\", \"text\": \"\\\"dalits are lowlives\\\"\", \"X1\": \"1\", \"type\": \"\\\"notgiven\\\"\", \"target\": \"\\\"notgiven\\\"\", \"level\": \"\\\"original\\\"\", \"split\": \"\\\"train\\\"\", \"round.base\": \"1\", \"annotator\": \"\\\"GNZuCtwed3\\\"\", \"round\": \"\\\"1\\\"\", \"acl.id.matched\": \"\\\"NA\\\"\"}", "columns": ["acl_id", "label", "text", "X1", "type", "target", "level", "split", "round_base", "annotator", "round", "acl_id_matched"], "columns_mapping": {"acl.id": "acl_id", "label": "label", "text": "text", "X1": "X1", "type": "type", "target": "target", "level": "level", "split": "split", "round.base": "round_base", "annotator": "annotator", "round": "round", "acl.id.matched": "acl_id_matched"}, "dataset_description": "We present a human-and-model-in-the-loop process for dynamically generating datasets and training better performing and more robust hate detection models. We provide a new dataset of ~40,000 entries, generated and labelled by trained annotators over four rounds of dynamic data creation. It includes ~15,000 challenging perturbations and each hateful entry has fine-grained labels for the type and target of hate. Hateful entries make up 54% of the dataset, which is substantially higher than comparable datasets. We show that model performance is substantially improved using this approach. Models trained on later rounds of data collection perform better on test sets and are harder for annotators to trick. They also perform better on HATECHECK, a suite of functional tests for online hate detection. See https://arxiv.org/abs/2012.15761 for more details.\n", "dataset_name": "aps/dynahate"}}, "tags": [], "is_gated": false}, "Filippo/osdg_cd": {"dataset_name": "Filippo/osdg_cd", "description": "The OSDG Community Dataset (OSDG-CD) is a public dataset of thousands of text excerpts, which were validated by approximately 1,000 OSDG Community Platform (OSDG-CP) citizen scientists from over 110 countries, with respect to the Sustainable Development Goals (SDGs).", "downloads": 33, "configs": {"main_config": {"config_name": "main_config", "sample_row": "{\"doi\": \"\\\"10.6027/9789289342698-7-en\\\"\", \"text_id\": \"\\\"00021941702cd84171ff33962197ca1f\\\"\", \"text\": \"\\\"From a gender perspective, Paulgaard points out t...\", \"sdg\": \"5\", \"label\": \"4\", \"labels_negative\": \"1\", \"labels_positive\": \"8\", \"agreement\": \"0.7777777777777778\"}", "columns": ["doi", "text_id", "text", "sdg", "label", "labels_negative", "labels_positive", "agreement"], "columns_mapping": {"doi": "doi", "text_id": "text_id", "text": "text", "sdg": "sdg", "label": "label", "labels_negative": "labels_negative", "labels_positive": "labels_positive", "agreement": "agreement"}, "dataset_description": "The OSDG Community Dataset (OSDG-CD) is a public dataset of thousands of text excerpts, which were validated by approximately 1,000 OSDG Community Platform (OSDG-CP) citizen scientists from over 110 countries, with respect to the Sustainable Development Goals (SDGs).\n", "dataset_name": "Filippo/osdg_cd"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en"], "is_gated": false}, "google/wit": {"dataset_name": "google/wit", "description": "Wikipedia-based Image Text (WIT) Dataset is a large multimodal multilingual dataset.\nWIT is composed of a curated set of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia languages.\nIts size enables WIT to be used as a pretraining dataset for multimodal machine learning models.", "downloads": 30, "configs": {"default": {"config_name": "default", "sample_row": "{\"language\": \"\\\"en\\\"\", \"page_url\": \"\\\"https://en.wikipedia.org/wiki/Oxydactylus\\\"\", \"image_url\": \"\\\"https://upload.wikimedia.org/wikipedia/commons/5/...\", \"page_title\": \"\\\"Oxydactylus\\\"\", \"section_title\": \"null\", \"hierarchical_section_title\": \"\\\"Oxydactylus\\\"\", \"caption_reference_description\": \"null\", \"caption_attribution_description\": \"\\\"English: Mounted skeleton of Oxydactylus longipes...\", \"caption_alt_text_description\": \"null\", \"mime_type\": \"\\\"image/jpeg\\\"\", \"original_height\": \"3564\", \"original_width\": \"2748\", \"is_main_image\": \"true\", \"attribution_passes_lang_id\": \"true\", \"page_changed_recently\": \"true\", \"context_page_description\": \"\\\"Oxydactylus is an extinct genus of camelid endemi...\", \"context_section_description\": \"\\\"Oxydactylus is an extinct genus of camelid endemi...\"}", "columns": ["language", "page_url", "image_url", "page_title", "section_title", "hierarchical_section_title", "caption_reference_description", "caption_attribution_description", "caption_alt_text_description", "mime_type", "original_height", "original_width", "is_main_image", "attribution_passes_lang_id", "page_changed_recently", "context_page_description", "context_section_description"], "columns_mapping": {"language": "language", "page_url": "page_url", "image_url": "image_url", "page_title": "page_title", "section_title": "section_title", "hierarchical_section_title": "hierarchical_section_title", "caption_reference_description": "caption_reference_description", "caption_attribution_description": "caption_attribution_description", "caption_alt_text_description": "caption_alt_text_description", "mime_type": "mime_type", "original_height": "original_height", "original_width": "original_width", "is_main_image": "is_main_image", "attribution_passes_lang_id": "attribution_passes_lang_id", "page_changed_recently": "page_changed_recently", "context_page_description": "context_page_description", "context_section_description": "context_section_description"}, "dataset_description": "Wikipedia-based Image Text (WIT) Dataset is a large multimodal multilingual dataset.\nWIT is composed of a curated set of 37.6 million entity rich image-text examples with 11.5 million unique images across 108 Wikipedia languages.\nIts size enables WIT to be used as a pretraining dataset for multimodal machine learning models.\n", "dataset_name": "google/wit"}}, "tags": ["task_categories:text-retrieval", "task_categories:image-to-text", "task_ids:image-captioning", "annotations_creators:machine-generated", "multilinguality:multilingual", "source_datasets:original", "source_datasets:extended|wikipedia", "language:af", "language:ar", "language:ast", "language:azb", "language:be", "language:bg", "language:bn", "language:br", "language:ca", "language:cs", "language:cy", "language:da", "language:de", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:fi", "language:fr", "language:fy", "language:ga", "language:gl", "language:hr", "language:hu", "language:hy", "language:id", "language:it", "language:iw", "language:ja", "language:ka", "language:ko", "language:la", "language:lt", "language:lv", "language:mk", "language:ml", "language:ms", "language:nl", "language:nn", "language:no", "language:pl", "language:pt", "language:ro", "language:ru", "language:sk", "language:sl", "language:sr", "language:sv", "language:th", "language:tr", "language:uk", "language:ur", "language:vi", "language:vo", "language:zh"], "is_gated": false}, "arbml/masader": {"dataset_name": "arbml/masader", "description": "Masader is the largest public catalogue for Arabic NLP datasets, which consists of more than 200 datasets annotated with 25 attributes.", "downloads": 13, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"Name\": \"\\\"Shami\\\"\", \"Subsets\": \"[{\\\"Name\\\": \\\"Jordanian\\\", \\\"Dialect\\\": \\\"ar-JO: (Arabic ...\", \"HF Link\": \"\\\"https://huggingface.co/datasets/arbml/Shami\\\"\", \"Link\": \"\\\"https://github.com/GU-CLASP/shami-corpus\\\"\", \"License\": \"\\\"Apache-2.0\\\"\", \"Year\": \"2018\", \"Language\": \"\\\"ar\\\"\", \"Dialect\": \"\\\"ar-LEV: (Arabic(Levant))\\\"\", \"Domain\": \"\\\"social media\\\"\", \"Form\": \"\\\"text\\\"\", \"Collection Style\": \"\\\"crawling and annotation(other)\\\"\", \"Description\": \"\\\"the first Levantine Dialect Corpus (SDC) covering...\", \"Volume\": \"\\\"117,805\\\"\", \"Unit\": \"\\\"sentences\\\"\", \"Ethical Risks\": \"\\\"Medium\\\"\", \"Provider\": \"\\\"Multiple institutions \\\"\", \"Derived From\": \"\\\"nan\\\"\", \"Paper Title\": \"\\\"Shami: A Corpus of Levantine Arabic Dialects\\\"\", \"Paper Link\": \"\\\"https://aclanthology.org/L18-1576.pdf\\\"\", \"Script\": \"\\\"Arab\\\"\", \"Tokenized\": \"\\\"No\\\"\", \"Host\": \"\\\"GitHub\\\"\", \"Access\": \"\\\"Free\\\"\", \"Cost\": \"\\\"nan\\\"\", \"Test Split\": \"\\\"No\\\"\", \"Tasks\": \"\\\"dialect identification\\\"\", \"Venue Title\": \"\\\"LREC\\\"\", \"Citations\": \"\\\"25.0\\\"\", \"Venue Type\": \"\\\"conference\\\"\", \"Venue Name\": \"\\\"International Conference on Language Resources an...\", \"Authors\": \"\\\"Chatrine Qwaider,Motaz Saad,S. Chatzikyriakidis,S...\", \"Affiliations\": \"\\\",The Islamic University of Gaza,,\\\"\", \"Abstract\": \"\\\"Modern Standard Arabic (MSA) is the official lang...\", \"Added By\": \"\\\"nan\\\"\"}", "columns": ["Name", "Subsets", "HF Link", "Link", "License", "Year", "Language", "Dialect", "Domain", "Form", "Collection Style", "Description", "Volume", "Unit", "Ethical Risks", "Provider", "Derived From", "Paper Title", "Paper Link", "Script", "Tokenized", "Host", "Access", "Cost", "Test Split", "Tasks", "Venue Title", "Citations", "Venue Type", "Venue Name", "Authors", "Affiliations", "Abstract", "Added By"], "columns_mapping": {"Name": "Name", "Subsets": "Subsets", "HF Link": "HF Link", "Link": "Link", "License": "License", "Year": "Year", "Language": "Language", "Dialect": "Dialect", "Domain": "Domain", "Form": "Form", "Collection Style": "Collection Style", "Description": "Description", "Volume": "Volume", "Unit": "Unit", "Ethical Risks": "Ethical Risks", "Provider": "Provider", "Derived From": "Derived From", "Paper Title": "Paper Title", "Paper Link": "Paper Link", "Script": "Script", "Tokenized": "Tokenized", "Host": "Host", "Access": "Access", "Cost": "Cost", "Test Split": "Test Split", "Tasks": "Tasks", "Venue Title": "Venue Title", "Citations": "Citations", "Venue Type": "Venue Type", "Venue Name": "Venue Name", "Authors": "Authors", "Affiliations": "Affiliations", "Abstract": "Abstract", "Added By": "Added By"}, "dataset_description": "Masader is the largest public catalogue for Arabic NLP datasets, which consists of more than 200 datasets annotated with 25 attributes. \n", "dataset_name": "arbml/masader"}}, "tags": [], "is_gated": false}, "searle-j/kote": {"dataset_name": "searle-j/kote", "description": "50k Korean online comments labeled for 44 emotion categories.", "downloads": 42, "configs": {"dichotomized": {"config_name": "dichotomized", "sample_row": "{\"ID\": \"\\\"39087\\\"\", \"text\": \"\\\"\\\\ub0b4\\\\uac00 \\\\ud1b0\\\\ud589\\\\ud06c\\\\uc2a4\\\\ub97c \\\\uc88...\", \"labels\": \"[2, 13, 15, 16, 29, 39]\"}", "columns": ["ID", "text", "labels"], "columns_mapping": {"ID": "ID", "text": "text", "labels": "labels"}, "dataset_description": "50k Korean online comments labeled for 44 emotion categories.\n", "dataset_name": "searle-j/kote"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:kor"], "is_gated": false}, "taln-ls2n/kptimes": {"dataset_name": "taln-ls2n/kptimes", "description": "KPTimes benchmark dataset for keyphrase extraction an generation.", "downloads": 13, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"id\": \"\\\"ny0282969\\\"\", \"title\": \"\\\"For Donald Trump\\\\u2019s Big Speech, an Added Pres...\", \"abstract\": \"\\\"CLEVELAND \\\\u2014 Until Monday night, Donald J. Tr...\", \"keyphrases\": \"[\\\"Donald Trump\\\", \\\"Speeches\\\", \\\"Plagiarism\\\", \\\"Melani...\", \"prmu\": \"[\\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"P\\\", \\\"R\\\", \\\"M\\\"]\", \"date\": \"\\\"2016/07/21\\\"\", \"categories\": \"[\\\"us\\\", \\\"politics\\\"]\"}", "columns": ["id", "title", "abstract", "keyphrases", "prmu", "date", "categories"], "columns_mapping": {"id": "id", "title": "title", "abstract": "abstract", "keyphrases": "keyphrases", "prmu": "prmu", "date": "date", "categories": "categories"}, "dataset_description": "KPTimes benchmark dataset for keyphrase extraction an generation.\n", "dataset_name": "taln-ls2n/kptimes"}}, "tags": ["task_categories:text-generation", "annotations_creators:unknown", "multilinguality:monolingual", "language:en"], "is_gated": false}, "strombergnlp/rustance": {"dataset_name": "strombergnlp/rustance", "description": "This is a stance prediction dataset in Russian. The dataset contains comments on news articles,\nand rows are a comment, the title of the news article it responds to, and the stance of the comment\ntowards the article.", "downloads": 15, "configs": {"rustance": {"config_name": "rustance", "sample_row": "{\"id\": \"\\\"0\\\"\", \"text\": \"\\\"\\\\u0412\\\\u043e\\\\u043b\\\\u043a\\\\u0438, \\\\u0432\\\\u043e\\\\u043...\", \"title\": \"\\\"\\\\u041c\\\\u0438\\\\u043d\\\\u043e\\\\u0431\\\\u043e\\\\u0440\\\\u043e\\\\...\", \"stance\": \"3\"}", "columns": ["id", "text", "title", "stance"], "columns_mapping": {"id": "id", "text": "text", "title": "title", "stance": "stance"}, "dataset_description": "This is a stance prediction dataset in Russian. The dataset contains comments on news articles,\nand rows are a comment, the title of the news article it responds to, and the stance of the comment\ntowards the article.\n", "dataset_name": "strombergnlp/rustance"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ru", "stance-detection"], "is_gated": false}, "ccdv/WCEP-10": {"dataset_name": "ccdv/WCEP-10", "description": "WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. Xiao et al.\"", "downloads": 30, "configs": {"newline": {"config_name": "newline", "sample_row": "{\"document\": \"\\\"Rodrigo Duterte, the new president of the Philipp...\", \"summary\": \"\\\"The death toll in Rodrigo Duterte's war on drugs ...\"}", "columns": ["document", "summary"], "columns_mapping": {"document": "document", "summary": "summary"}, "dataset_description": "\n WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. Xiao et al.\"\n\n", "dataset_name": "ccdv/WCEP-10"}, "roberta": {"config_name": "roberta", "sample_row": "{\"document\": \"\\\"Rodrigo Duterte, the new president of the Philipp...\", \"summary\": \"\\\"The death toll in Rodrigo Duterte's war on drugs ...\"}", "columns": ["document", "summary"], "columns_mapping": {"document": "document", "summary": "summary"}, "dataset_description": "\n WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. Xiao et al.\"\n\n", "dataset_name": "ccdv/WCEP-10"}, "bert": {"config_name": "bert", "sample_row": "{\"document\": \"\\\"Rodrigo Duterte, the new president of the Philipp...\", \"summary\": \"\\\"The death toll in Rodrigo Duterte's war on drugs ...\"}", "columns": ["document", "summary"], "columns_mapping": {"document": "document", "summary": "summary"}, "dataset_description": "\n WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. Xiao et al.\"\n\n", "dataset_name": "ccdv/WCEP-10"}, "list": {"config_name": "list", "sample_row": "{\"document\": \"[\\\"Rodrigo Duterte, the new president of the Philip...\", \"summary\": \"\\\"The death toll in Rodrigo Duterte's war on drugs ...\"}", "columns": ["document", "summary"], "columns_mapping": {"document": "document", "summary": "summary"}, "dataset_description": "\n WCEP10 dataset for summarization.\n From paper: \"A Large-Scale Multi-Document Summarization Dataset from the Wikipedia\n Current Events Portal\" by D. Gholipour et al.\"\n From paper: \"PRIMER: Pyramid-based Masked Sentence Pre-training for Multi-document\n Summarization\" by W. Xiao et al.\"\n\n", "dataset_name": "ccdv/WCEP-10"}}, "tags": ["task_categories:summarization", "task_categories:text2text-generation", "multilinguality:monolingual", "language:en", "conditional-text-generation"], "is_gated": false}, "strombergnlp/nordic_langid": {"dataset_name": "strombergnlp/nordic_langid", "description": "Automatic language identification is a challenging problem. Discriminating\nbetween closely related languages is especially difficult. This paper presents\na machine learning approach for automatic language identification for the\nNordic languages, which often suffer miscategorisation by existing \nstate-of-the-art tools. Concretely we will focus on discrimination between six \nNordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm\u00e5l), \nFaroese and Icelandic.\n\nThis is the data for the tasks. Two variants are provided: 10K and 50K, with\nholding 10,000 and 50,000 examples for each language respectively.", "downloads": 101, "configs": {"10k": {"config_name": "10k", "sample_row": "{\"id\": \"\\\"0\\\"\", \"sentence\": \"\\\"den ble gitt charter av ranulf de blondeville\\\"\", \"language\": \"2\"}", "columns": ["id", "sentence", "language"], "columns_mapping": {"id": "id", "sentence": "sentence", "language": "language"}, "dataset_description": "Automatic language identification is a challenging problem. Discriminating\nbetween closely related languages is especially difficult. This paper presents\na machine learning approach for automatic language identification for the\nNordic languages, which often suffer miscategorisation by existing \nstate-of-the-art tools. Concretely we will focus on discrimination between six \nNordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm\u00e5l), \nFaroese and Icelandic.\n\nThis is the data for the tasks. Two variants are provided: 10K and 50K, with\nholding 10,000 and 50,000 examples for each language respectively.\n\n", "dataset_name": "strombergnlp/nordic_langid"}, "50k": {"config_name": "50k", "sample_row": "{\"id\": \"\\\"0\\\"\", \"sentence\": \"\\\"jackson er anerkjent som den mest suksessrike art...\", \"language\": \"2\"}", "columns": ["id", "sentence", "language"], "columns_mapping": {"id": "id", "sentence": "sentence", "language": "language"}, "dataset_description": "Automatic language identification is a challenging problem. Discriminating\nbetween closely related languages is especially difficult. This paper presents\na machine learning approach for automatic language identification for the\nNordic languages, which often suffer miscategorisation by existing \nstate-of-the-art tools. Concretely we will focus on discrimination between six \nNordic languages: Danish, Swedish, Norwegian (Nynorsk), Norwegian (Bokm\u00e5l), \nFaroese and Icelandic.\n\nThis is the data for the tasks. Two variants are provided: 10K and 50K, with\nholding 10,000 and 50,000 examples for each language respectively.\n\n", "dataset_name": "strombergnlp/nordic_langid"}}, "tags": ["task_categories:text-classification", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:da", "language:nn", "language:nb", "language:fo", "language:is", "language:sv", "language-identification"], "is_gated": false}, "strombergnlp/bornholmsk_parallel": {"dataset_name": "strombergnlp/bornholmsk_parallel", "description": "This dataset is parallel text for Bornholmsk and Danish. \n\nFor more details, see the paper [Bornholmsk Natural Language Processing: Resources and Tools](https://aclanthology.org/W19-6138/).", "downloads": 116, "configs": {"BornholmskParallel": {"config_name": "BornholmskParallel", "sample_row": "{\"id\": \"\\\"0\\\"\", \"da_bornholm\": \"\\\"Hanj va ful \\\\u00e5 allera\\\"\", \"da\": \"\\\"Han var fuld af beundring\\\"\"}", "columns": ["id", "da_bornholm", "da"], "columns_mapping": {"id": "id", "da_bornholm": "da_bornholm", "da": "da"}, "dataset_description": "This dataset is parallel text for Bornholmsk and Danish. \n\nFor more details, see the paper [Bornholmsk Natural Language Processing: Resources and Tools](https://aclanthology.org/W19-6138/).\n", "dataset_name": "strombergnlp/bornholmsk_parallel"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original"], "is_gated": false}, "lmqg/qg_subjqa": {"dataset_name": "lmqg/qg_subjqa", "description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "downloads": 28, "configs": {"all": {"config_name": "all", "sample_row": "{\"answer\": \"\\\"any book that takes me 3 months and 20 different ...\", \"paragraph_question\": \"\\\"question: How is book?, context: I am giving \\\\\\\"Go...\", \"question\": \"\\\"How is book?\\\"\", \"sentence\": \"\\\"In my mind, any book that takes me 3 months and 2...\", \"paragraph\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"sentence_answer\": \"\\\"In my mind, any book that takes me 3 months ...\", \"paragraph_answer\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"paragraph_sentence\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"paragraph_id\": \"\\\"1b7cc3db9ec681edd253a41a2785b5a9\\\"\", \"question_subj_level\": \"2\", \"answer_subj_level\": \"2\", \"domain\": \"\\\"books\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "books": {"config_name": "books", "sample_row": "{\"answer\": \"\\\"any book that takes me 3 months and 20 different ...\", \"paragraph_question\": \"\\\"question: How is book?, context: I am giving \\\\\\\"Go...\", \"question\": \"\\\"How is book?\\\"\", \"sentence\": \"\\\"In my mind, any book that takes me 3 months and 2...\", \"paragraph\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"sentence_answer\": \"\\\"In my mind, any book that takes me 3 months ...\", \"paragraph_answer\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"paragraph_sentence\": \"\\\"I am giving \\\\\\\"Gone Girl\\\\\\\" 3 stars, but only begru...\", \"paragraph_id\": \"\\\"1b7cc3db9ec681edd253a41a2785b5a9\\\"\", \"question_subj_level\": \"2\", \"answer_subj_level\": \"2\", \"domain\": \"\\\"books\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "electronics": {"config_name": "electronics", "sample_row": "{\"answer\": \"\\\"the keyboard and its difficult to adjust the arm ...\", \"paragraph_question\": \"\\\"question: How would you describe the keyboard?, c...\", \"question\": \"\\\"How would you describe the keyboard?\\\"\", \"sentence\": \"\\\"First, when you try and open it you have to fight...\", \"paragraph\": \"\\\"The concept was good, the execution was terrible....\", \"sentence_answer\": \"\\\"First, when you try and open it you have to fight...\", \"paragraph_answer\": \"\\\"The concept was good, the execution was terrible....\", \"paragraph_sentence\": \"\\\"The concept was good, the execution was terrible....\", \"paragraph_id\": \"\\\"8d0cdd656a9e45b9acf198638711c4f6\\\"\", \"question_subj_level\": \"2\", \"answer_subj_level\": \"2\", \"domain\": \"\\\"electronics\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "grocery": {"config_name": "grocery", "sample_row": "{\"answer\": \"\\\"I love a deep, bold coffee but don't like acidic ...\", \"paragraph_question\": \"\\\"question: How do you like the coffee?, context: I...\", \"question\": \"\\\"How do you like the coffee?\\\"\", \"sentence\": \"\\\"I love a deep, bold coffee but don't like acidic ...\", \"paragraph\": \"\\\"I usually like to grind my own beans but tried th...\", \"sentence_answer\": \"\\\" I love a deep, bold coffee but don't like ac...\", \"paragraph_answer\": \"\\\"I usually like to grind my own beans but tried th...\", \"paragraph_sentence\": \"\\\"I usually like to grind my own beans but tried th...\", \"paragraph_id\": \"\\\"bf7314a2f905b2b72c358bfe556200f4\\\"\", \"question_subj_level\": \"1\", \"answer_subj_level\": \"1\", \"domain\": \"\\\"grocery\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "movies": {"config_name": "movies", "sample_row": "{\"answer\": \"\\\"when this movie first came out\\\"\", \"paragraph_question\": \"\\\"question: Is this movie recommended?, context: To...\", \"question\": \"\\\"Is this movie recommended?\\\"\", \"sentence\": \"\\\"To be honest, when this movie first came out , I ...\", \"paragraph\": \"\\\"To be honest, when this movie first came out, I r...\", \"sentence_answer\": \"\\\"To be honest, when this movie first came out...\", \"paragraph_answer\": \"\\\"To be honest, when this movie first came out...\", \"paragraph_sentence\": \"\\\" To be honest, when this movie first came out...\", \"paragraph_id\": \"\\\"5901dbf09ed091190bf05b54ce8d9d95\\\"\", \"question_subj_level\": \"1\", \"answer_subj_level\": \"1\", \"domain\": \"\\\"movies\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "restaurants": {"config_name": "restaurants", "sample_row": "{\"answer\": \"\\\"My wife's salad looked like it was fished out of ...\", \"paragraph_question\": \"\\\"question: Does this food stink others?, context: ...\", \"question\": \"\\\"Does this food stink others?\\\"\", \"sentence\": \"\\\"My wife's salad looked like it was fished out of ...\", \"paragraph\": \"\\\"We went here with our expectations quite low. Aft...\", \"sentence_answer\": \"\\\" My wife's salad looked like it was fished ou...\", \"paragraph_answer\": \"\\\"We went here with our expectations quite low. Aft...\", \"paragraph_sentence\": \"\\\"We went here with our expectations quite low. Aft...\", \"paragraph_id\": \"\\\"nQj2DGkomIWsKL6SRu8GGg\\\"\", \"question_subj_level\": \"1\", \"answer_subj_level\": \"1\", \"domain\": \"\\\"restaurants\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}, "tripadvisor": {"config_name": "tripadvisor", "sample_row": "{\"answer\": \"\\\"The lobby is Great\\\"\", \"paragraph_question\": \"\\\"question: How's the hotel lobby?, context: The lo...\", \"question\": \"\\\"How's the hotel lobby?\\\"\", \"sentence\": \"\\\"The lobby is Great , but it all ends there.\\\"\", \"paragraph\": \"\\\"The lobby is Great, but it all ends there. I was ...\", \"sentence_answer\": \"\\\" The lobby is Great , but it all ends th...\", \"paragraph_answer\": \"\\\" The lobby is Great , but it all ends the...\", \"paragraph_sentence\": \"\\\" The lobby is Great , but it all ends there. ...\", \"paragraph_id\": \"\\\"tripadvisor_review_3303\\\"\", \"question_subj_level\": \"1\", \"answer_subj_level\": \"1\", \"domain\": \"\\\"tripadvisor\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id", "question_subj_level", "answer_subj_level", "domain"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id", "question_subj_level": "question_subj_level", "answer_subj_level": "answer_subj_level", "domain": "domain"}, "dataset_description": "[SubjQA](https://github.com/megagonlabs/SubjQA) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_subjqa"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:subjqa", "language:en", "question-generation"], "is_gated": false}, "ncats/EpiSet4NER-v2": {"dataset_name": "ncats/EpiSet4NER-v2", "description": "**REWRITE*\nEpiSet4NER-2 is a dataset generated from 620 rare disease abstracts labeled using statistical and rule-base methods. \nFor more details see *INSERT PAPER* and https://github.com/ncats/epi4GARD/tree/master/EpiExtract4GARD#epiextract4gard", "downloads": 11, "configs": {"EpiSet4NER": {"config_name": "EpiSet4NER", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Background\\\", \\\"Chemotherapy\\\", \\\"-\\\", \\\"induced\\\", \\\"ca...\", \"ner_tags\": \"[0, 1, 2, 2, 2, 0, 3, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "**REWRITE*\nEpiSet4NER-2 is a dataset generated from 620 rare disease abstracts labeled using statistical and rule-base methods. \nFor more details see *INSERT PAPER* and https://github.com/ncats/epi4GARD/tree/master/EpiExtract4GARD#epiextract4gard\n", "dataset_name": "ncats/EpiSet4NER-v2"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:machine-generated", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "epidemiology", "rare disease", "named entity recognition", "NER", "NIH"], "is_gated": false}, "strombergnlp/rumoureval_2019": {"dataset_name": "strombergnlp/rumoureval_2019", "description": "\nStance prediction task in English. The goal is to predict whether a given reply to a claim either supports, denies, questions, or simply comments on the claim. Ran as a SemEval task in 2019.", "downloads": 10, "configs": {"RumourEval2019": {"config_name": "RumourEval2019", "sample_row": "{\"id\": \"\\\"0\\\"\", \"source_text\": \"\\\"France: 10 people dead after shooting at HQ of sa...\", \"reply_text\": \"\\\"MT @euronews France: 10 dead after shooting at HQ...\", \"label\": \"3\"}", "columns": ["id", "source_text", "reply_text", "label"], "columns_mapping": {"id": "id", "source_text": "source_text", "reply_text": "reply_text", "label": "label"}, "dataset_description": "\nStance prediction task in English. The goal is to predict whether a given reply to a claim either supports, denies, questions, or simply comments on the claim. Ran as a SemEval task in 2019.\n", "dataset_name": "strombergnlp/rumoureval_2019"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:en", "stance-detection"], "is_gated": false}, "HuggingFaceM4/webvid": {"dataset_name": "HuggingFaceM4/webvid", "description": "WebVid is a large-scale dataset of video clips with textual descriptions sourced from the web. The videos are diverse and rich in their content.", "downloads": 986, "configs": {"2M": {"config_name": "2M", "sample_row": "{\"videoid\": \"31353427\", \"name\": \"\\\"Merida, mexico - may 23, 2017: tourists are walki...\", \"page_dir\": \"\\\"016401_016450\\\"\", \"duration\": \"15\", \"contentUrl\": \"\\\"https://ak.picdn.net/shutterstock/videos/31353427...\"}", "columns": ["videoid", "dataset_name", "page_dir", "duration", "contentUrl"], "columns_mapping": {"videoid": "videoid", "dataset_name": "dataset_name", "page_dir": "page_dir", "duration": "duration", "contentUrl": "contentUrl"}, "dataset_description": "WebVid is a large-scale dataset of video clips with textual descriptions sourced from the web. The videos are diverse and rich in their content.\n", "dataset_name": "HuggingFaceM4/webvid"}, "10M": {"config_name": "10M", "sample_row": "{\"videoid\": \"21179416\", \"name\": \"\\\"Aerial shot winter forest\\\"\", \"page_dir\": \"\\\"006001_006050\\\"\", \"duration\": \"11\", \"contentUrl\": \"\\\"https://ak.picdn.net/shutterstock/videos/21179416...\"}", "columns": ["videoid", "dataset_name", "page_dir", "duration", "contentUrl"], "columns_mapping": {"videoid": "videoid", "dataset_name": "dataset_name", "page_dir": "page_dir", "duration": "duration", "contentUrl": "contentUrl"}, "dataset_description": "WebVid is a large-scale dataset of video clips with textual descriptions sourced from the web. The videos are diverse and rich in their content.\n", "dataset_name": "HuggingFaceM4/webvid"}}, "tags": [], "is_gated": false}, "HuggingFaceM4/vatex": {"dataset_name": "HuggingFaceM4/vatex", "description": "VATEX is a large-scale multilingual video description dataset, which contains over 41,250 videos and 825,000 captions\nin both English and Chinese. VATEX is characterized by the following major unique properties.\nFirst, it contains both English and Chinese descriptions at scale, which can support many multilingual studies\nthat are constrained by monolingual datasets. Secondly, VATEX has a high number of clip-sentence pairs\nwith each video clip annotated with multiple unique sentences, and every caption is unique in\nthe whole corpus. Third, VATEX contains more comprehensive yet representative video content,\ncovering 600 human activities in total. Furthermore, both the English and Chinese corpora in\nVATEX are lexically richer and thus allow more natural and diverse caption generation.", "downloads": 88, "configs": {"v1.1": {"config_name": "v1.1", "sample_row": "{\"videoID\": \"\\\"Ptf_2VRj-V0\\\"\", \"path\": \"\\\"https://www.youtube.com/watch?v=Ptf_2VRj-V0\\\"\", \"start\": \"122\", \"end\": \"132\", \"enCap\": \"[\\\"People wearing harnesses using ropes to climb up...\", \"chCap\": \"[\\\"\\\\u4e00\\\\u4e2a\\\\u5e26\\\\u7740\\\\u767d\\\\u8272\\\\u5b89\\\\u5168...\"}", "columns": ["videoID", "path", "start", "end", "enCap", "chCap"], "columns_mapping": {"videoID": "videoID", "path": "path", "start": "start", "end": "end", "enCap": "enCap", "chCap": "chCap"}, "dataset_description": "VATEX is a large-scale multilingual video description dataset, which contains over 41,250 videos and 825,000 captions\nin both English and Chinese. VATEX is characterized by the following major unique properties.\nFirst, it contains both English and Chinese descriptions at scale, which can support many multilingual studies\nthat are constrained by monolingual datasets. Secondly, VATEX has a high number of clip-sentence pairs\nwith each video clip annotated with multiple unique sentences, and every caption is unique in\nthe whole corpus. Third, VATEX contains more comprehensive yet representative video content,\ncovering 600 human activities in total. Furthermore, both the English and Chinese corpora in\nVATEX are lexically richer and thus allow more natural and diverse caption generation.\n", "dataset_name": "HuggingFaceM4/vatex"}, "v1.0": {"config_name": "v1.0", "sample_row": "{\"videoID\": \"\\\"Ptf_2VRj-V0\\\"\", \"path\": \"\\\"https://www.youtube.com/watch?v=Ptf_2VRj-V0\\\"\", \"start\": \"122\", \"end\": \"132\", \"enCap\": \"[\\\"People wearing harnesses using ropes to climb up...\", \"chCap\": \"[\\\"\\\\u4e00\\\\u4e2a\\\\u5e26\\\\u7740\\\\u767d\\\\u8272\\\\u5b89\\\\u5168...\"}", "columns": ["videoID", "path", "start", "end", "enCap", "chCap"], "columns_mapping": {"videoID": "videoID", "path": "path", "start": "start", "end": "end", "enCap": "enCap", "chCap": "chCap"}, "dataset_description": "VATEX is a large-scale multilingual video description dataset, which contains over 41,250 videos and 825,000 captions\nin both English and Chinese. VATEX is characterized by the following major unique properties.\nFirst, it contains both English and Chinese descriptions at scale, which can support many multilingual studies\nthat are constrained by monolingual datasets. Secondly, VATEX has a high number of clip-sentence pairs\nwith each video clip annotated with multiple unique sentences, and every caption is unique in\nthe whole corpus. Third, VATEX contains more comprehensive yet representative video content,\ncovering 600 human activities in total. Furthermore, both the English and Chinese corpora in\nVATEX are lexically richer and thus allow more natural and diverse caption generation.\n", "dataset_name": "HuggingFaceM4/vatex"}}, "tags": [], "is_gated": false}, "mwritescode/slither-audited-smart-contracts": {"dataset_name": "mwritescode/slither-audited-smart-contracts", "description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.", "downloads": 1980, "configs": {"all-plain-text": {"config_name": "all-plain-text", "sample_row": "{\"address\": \"\\\"0x006699d34AA3013605d468d2755A2Fe59A16B12B\\\"\", \"source_code\": \"\\\"pragma solidity 0.5.4;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n\\\\...\", \"bytecode\": \"\\\"0x608060405234801561001057600080fd5b5060043610610...\", \"slither\": \"\\\"{\\\\\\\"success\\\\\\\": true, \\\\\\\"error\\\\\\\": null, \\\\\\\"results\\\\\\\":...\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}, "all-multilabel": {"config_name": "all-multilabel", "sample_row": "{\"address\": \"\\\"0x006699d34AA3013605d468d2755A2Fe59A16B12B\\\"\", \"source_code\": \"\\\"pragma solidity 0.5.4;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n\\\\...\", \"bytecode\": \"\\\"0x608060405234801561001057600080fd5b5060043610610...\", \"slither\": \"[4]\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}, "big-plain-text": {"config_name": "big-plain-text", "sample_row": "{\"address\": \"\\\"0x006699d34AA3013605d468d2755A2Fe59A16B12B\\\"\", \"source_code\": \"\\\"pragma solidity 0.5.4;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n\\\\...\", \"bytecode\": \"\\\"0x608060405234801561001057600080fd5b5060043610610...\", \"slither\": \"\\\"{\\\\\\\"success\\\\\\\": true, \\\\\\\"error\\\\\\\": null, \\\\\\\"results\\\\\\\":...\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}, "big-multilabel": {"config_name": "big-multilabel", "sample_row": "{\"address\": \"\\\"0x006699d34AA3013605d468d2755A2Fe59A16B12B\\\"\", \"source_code\": \"\\\"pragma solidity 0.5.4;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n\\\\...\", \"bytecode\": \"\\\"0x608060405234801561001057600080fd5b5060043610610...\", \"slither\": \"[1]\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}, "small-plain-text": {"config_name": "small-plain-text", "sample_row": "{\"address\": \"\\\"0x01b23286ff60a543ec29366ae8d6b6274ca20541\\\"\", \"source_code\": \"\\\"pragma solidity 0.4.26;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n...\", \"bytecode\": \"\\\"0x608060405260043610610112576000357c0100000000000...\", \"slither\": \"\\\"{\\\\\\\"success\\\\\\\": true, \\\\\\\"error\\\\\\\": null, \\\\\\\"results\\\\\\\":...\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}, "small-multilabel": {"config_name": "small-multilabel", "sample_row": "{\"address\": \"\\\"0x01b23286ff60a543ec29366ae8d6b6274ca20541\\\"\", \"source_code\": \"\\\"pragma solidity 0.4.26;\\\\n\\\\ninterface IERC20 {\\\\n\\\\n...\", \"bytecode\": \"\\\"0x608060405260043610610112576000357c0100000000000...\", \"slither\": \"[6]\"}", "columns": ["address", "source_code", "bytecode", "slither"], "columns_mapping": {"address": "address", "source_code": "source_code", "bytecode": "bytecode", "slither": "slither"}, "dataset_description": "This dataset contains source code and deployed bytecode for Solidity Smart Contracts that have been verified on Etherscan.io, along with a classification of their vulnerabilities according to the Slither static analysis framework.\n", "dataset_name": "mwritescode/slither-audited-smart-contracts"}}, "tags": ["task_categories:text-classification", "task_categories:text-generation", "task_ids:multi-label-classification", "task_ids:multi-input-text-classification", "task_ids:language-modeling", "annotations_creators:other", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "wdc/products-2017": {"dataset_name": "wdc/products-2017", "description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.", "downloads": 265, "configs": {"computers_xlarge": {"config_name": "computers_xlarge", "sample_row": "{\"pair_id\": \"\\\"2551242#16272671\\\"\", \"label\": \"1\", \"id_left\": \"2551242\", \"category_left\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_left\": \"79457\", \"brand_left\": \"\\\"\\\\\\\"Corsair\\\\\\\"@en\\\"\", \"title_left\": \"\\\" \\\\\\\"Corsair Vengeance LPX Black 64GB (4x16GB) DDR4...\", \"description_left\": \"\\\"\\\\\\\"DDR4, 2666MHz, CL16, 1.2v, XMP 2.0, Lifetime Wa...\", \"price_left\": \"null\", \"specTableContent_left\": \"\\\" Memory Type DDR4 (PC4-21300) Capacity 64GB (4 x ...\", \"id_right\": \"16272671\", \"category_right\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_right\": \"79457\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Corsair Vengeance LPX CMK64GX4M4A2666C16 - Pri...\", \"description_right\": \"null\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" Categorie Geheugen intern Merk Corsair Productse...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "computers_large": {"config_name": "computers_large", "sample_row": "{\"pair_id\": \"\\\"10350670#11790323\\\"\", \"label\": \"0\", \"id_left\": \"10350670\", \"category_left\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_left\": \"95342\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"SilverStone ECM20 Adaptador PCIe a M.2\\\\\\\"@es M....\", \"description_left\": \"null\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"11790323\", \"category_right\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_right\": \"1450313\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Samsung 960 Pro 2TB - Prijzen \\\\\\\"@NL Tweakers\\\\\\\"...\", \"description_right\": \"null\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" Categorie Solid state drives Merk Samsung Produc...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "computers_medium": {"config_name": "computers_medium", "sample_row": "{\"pair_id\": \"\\\"14219585#11723285\\\"\", \"label\": \"0\", \"id_left\": \"14219585\", \"category_left\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_left\": \"521249\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Apple - Mac Pro Desktop Computer 6-Core Intel\\\\...\", \"description_left\": \"\\\"\\\\\\\"Apple Mac Pro MD878LL/A Desktop: Designed for p...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"11723285\", \"category_right\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_right\": \"9835048\", \"brand_right\": \"\\\"\\\\\\\"HP Enterprise\\\\\\\"\\\"\", \"title_right\": \"\\\" \\\\\\\"COMPAQ PL ML530R G3 Xeon 3.0GHz 1GB\\\\\\\", \\\\\\\"Null\\\\...\", \"description_right\": \"\\\"\\\\\\\"Description:\\\\n271246-001 Proliant Xeon 3.0GHz ...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "computers_small": {"config_name": "computers_small", "sample_row": "{\"pair_id\": \"\\\"15745640#14832469\\\"\", \"label\": \"0\", \"id_left\": \"15745640\", \"category_left\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_left\": \"914299\", \"brand_left\": \"\\\"\\\\\\\"HP Enterprise\\\\\\\"\\\"\", \"title_left\": \"\\\" \\\\\\\"631674-B21 HP Smart Array P421/2GB Controller\\\\...\", \"description_left\": \"\\\" \\\\\\\"Description:HP Smart Array P421/2GB FBWC 6Gb2-...\", \"price_left\": \"\\\"\\\\\\\"CAD\\\\\\\", \\\\\\\"$605.74 CAD\\\\\\\"\\\"\", \"specTableContent_left\": \"\\\" Specifications: Category Proliant Controller Sub...\", \"id_right\": \"14832469\", \"category_right\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_right\": \"4293688\", \"brand_right\": \"\\\"\\\\\\\"HP Enterprise\\\\\\\"\\\"\", \"title_right\": \"\\\" \\\\\\\"300680-B21 HP 2GB (2x1GB) 266MHz SDRAM Kit\\\\\\\", ...\", \"description_right\": \"\\\"\\\\\\\"Description:Genuine HPE 2GB (2x1GB) Registered ...\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" 300680-B21\\\\u00a0Compatible Servers: BL20p G2 BL3...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "cameras_xlarge": {"config_name": "cameras_xlarge", "sample_row": "{\"pair_id\": \"\\\"11933246#14836018\\\"\", \"label\": \"0\", \"id_left\": \"11933246\", \"category_left\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_left\": \"1041874\", \"brand_left\": \"\\\"\\\\\\\"Canon\\\\\\\"@en\\\"\", \"title_left\": \"\\\" \\\\\\\"Canon EOS 5D Mark IV DSLR Camera with 24-105mm...\", \"description_left\": \"\\\"\\\\\\\"\\\\n30.4MP Full-Frame CMOS Sensor\\\\nEF 24-105mm f/...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"14836018\", \"category_right\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_right\": \"197207\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\" C\\\\u00e1mara CANON EOS 6D + Lente EF 24-105L \\\\...\", \"description_right\": \"\\\"\\\\\\\"\\\\n C\\\\u00e1mara CANON...\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" Marca CANON Megapixeles 20.2 MP TAMA\\\\u00d1O DE P...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "cameras_large": {"config_name": "cameras_large", "sample_row": "{\"pair_id\": \"\\\"16965715#5931545\\\"\", \"label\": \"1\", \"id_left\": \"16965715\", \"category_left\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_left\": \"9309675\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Veho VCC-005 MUVI HD NPNG Body Camera/Action C...\", \"description_left\": \"\\\"\\\\\\\"\\\\n\\\\tHD video at 30fps & Up to 8MP Stills\\\\n\\\\t170...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"5931545\", \"category_right\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_right\": \"9309675\", \"brand_right\": \"\\\"\\\\\\\"Veho\\\\\\\"@en-US\\\"\", \"title_right\": \"\\\" \\\\\\\"Veho VCC-005-MUVI-NPNG MUVI HD Mini Handsfree ...\", \"description_right\": \"\\\"\\\\\\\"Veho are pleased to announce the partnership wi...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "cameras_medium": {"config_name": "cameras_medium", "sample_row": "{\"pair_id\": \"\\\"16965715#5931545\\\"\", \"label\": \"1\", \"id_left\": \"16965715\", \"category_left\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_left\": \"9309675\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Veho VCC-005 MUVI HD NPNG Body Camera/Action C...\", \"description_left\": \"\\\"\\\\\\\"\\\\n\\\\tHD video at 30fps & Up to 8MP Stills\\\\n\\\\t170...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"5931545\", \"category_right\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_right\": \"9309675\", \"brand_right\": \"\\\"\\\\\\\"Veho\\\\\\\"@en-US\\\"\", \"title_right\": \"\\\" \\\\\\\"Veho VCC-005-MUVI-NPNG MUVI HD Mini Handsfree ...\", \"description_right\": \"\\\"\\\\\\\"Veho are pleased to announce the partnership wi...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "cameras_small": {"config_name": "cameras_small", "sample_row": "{\"pair_id\": \"\\\"2900433#6082212\\\"\", \"label\": \"0\", \"id_left\": \"2900433\", \"category_left\": \"\\\"Computers_and_Accessories\\\"\", \"cluster_id_left\": \"387759\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Transcend 64GB microSDXC UHS-I 300x, Class 10\\\\...\", \"description_left\": \"null\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"6082212\", \"category_right\": \"\\\"Camera_and_Photo\\\"\", \"cluster_id_right\": \"368922\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"TARJETA SD 32GB SDHC CLASE 10 300X\\\\\\\" 300X | Tr...\", \"description_right\": \"\\\"\\\\\\\"Tipolog\\\\u00eda: Secure Digital analogico; Capac...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "watches_xlarge": {"config_name": "watches_xlarge", "sample_row": "{\"pair_id\": \"\\\"2679850#3297990\\\"\", \"label\": \"0\", \"id_left\": \"2679850\", \"category_left\": \"\\\"Luggage_and_Travel_Gear\\\"\", \"cluster_id_left\": \"1719439\", \"brand_left\": \"\\\"\\\\\\\"\\\\n\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\tPrada\\\\n\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t\\\\t...\", \"title_left\": \"\\\" \\\\\\\" Prada Papaya Saffiano Lux Leather Parabole To...\", \"description_left\": \"\\\"\\\\\\\"\\\\n This stunning Prada Papaya Saffiano L...\", \"price_left\": \"null\", \"specTableContent_left\": \"\\\" Shipping Method Estimated Transit Time Fee per O...\", \"id_right\": \"3297990\", \"category_right\": \"\\\"Sports_and_Outdoors\\\"\", \"cluster_id_right\": \"1631615\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"TomTom Runner 2 Cardio Large Zwart (Zwart) - P...\", \"description_right\": \"null\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" Categorie Wearables Merk TomTom Product TomTom R...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "watches_large": {"config_name": "watches_large", "sample_row": "{\"pair_id\": \"\\\"50240#16579903\\\"\", \"label\": \"0\", \"id_left\": \"50240\", \"category_left\": \"\\\"Jewelry\\\"\", \"cluster_id_left\": \"8861668\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Bvlgari Bvlgari Watch BBL33WSPGD\\\\\\\" BBL37WSPG 1...\", \"description_left\": \"\\\"\\\\\\\"

A fine watch makes a brilliant statement of...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"16579903\", \"category_right\": \"\\\"Jewelry\\\"\", \"cluster_id_right\": \"12081440\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Bvlgari Tubogas Watch SP35BDSDS.1T\\\\\\\" SP35BSPGD...\", \"description_right\": \"\\\"\\\\\\\"

A fine watch makes a magnificent statement ...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "watches_medium": {"config_name": "watches_medium", "sample_row": "{\"pair_id\": \"\\\"8902994#16287862\\\"\", \"label\": \"0\", \"id_left\": \"8902994\", \"category_left\": \"\\\"Jewelry\\\"\", \"cluster_id_left\": \"1084360\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Mens Visodate Automatic Watch \\\\\\\"@de \\\\\\\"Tissot T...\", \"description_left\": \"null\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"16287862\", \"category_right\": \"\\\"Jewelry\\\"\", \"cluster_id_right\": \"871120\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Tissot T048.417.27.057.01 T-Race\\\\\\\"@es \\\\\\\"Reloj ...\", \"description_right\": \"\\\"\\\\\\\"Reloj Tissot T-Sport T-Race\\\\u00a0T0484172705701...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "watches_small": {"config_name": "watches_small", "sample_row": "{\"pair_id\": \"\\\"17014053#13812379\\\"\", \"label\": \"0\", \"id_left\": \"17014053\", \"category_left\": \"\\\"Jewelry\\\"\", \"cluster_id_left\": \"11012750\", \"brand_left\": \"\\\"\\\\\\\"Rolex\\\\\\\"\\\"\", \"title_left\": \"\\\" \\\\\\\"Rolex Milgauss 116400 GV\\\\\\\" GV Watch | Watchfin...\", \"description_left\": \"\\\"\\\\\\\"This Rolex has undergone a thorough inspection ...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"13812379\", \"category_right\": \"\\\"Jewelry\\\"\", \"cluster_id_right\": \"3893035\", \"brand_right\": \"\\\"\\\\\\\"Cartier\\\\\\\"\\\"\", \"title_right\": \"\\\" \\\\\\\"Cartier Roadster W62004V3\\\\\\\" W62004V3 Watch | W...\", \"description_right\": \"\\\"\\\\\\\"This Cartier has undergone a thorough inspectio...\", \"price_right\": \"null\", \"specTableContent_right\": \"\\\" Name Monthly Repayment Total Amount Cost of Cred...\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "shoes_xlarge": {"config_name": "shoes_xlarge", "sample_row": "{\"pair_id\": \"\\\"9725423#5777153\\\"\", \"label\": \"0\", \"id_left\": \"9725423\", \"category_left\": \"\\\"Shoes\\\"\", \"cluster_id_left\": \"16023037\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Concave Volt + SG - Silver/Blue\\\\\\\"@en \\\\\\\" Concav...\", \"description_left\": \"\\\"\\\\\\\"\\\\n With a pu...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"5777153\", \"category_right\": \"\\\"Shoes\\\"\", \"cluster_id_right\": \"12588487\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Nike Tiempo Legend V FG - Blu Laguna/Bianco/Vo...\", \"description_right\": \"\\\"\\\\\\\"\\\\n Con una t...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "shoes_large": {"config_name": "shoes_large", "sample_row": "{\"pair_id\": \"\\\"1933376#3418973\\\"\", \"label\": \"0\", \"id_left\": \"1933376\", \"category_left\": \"\\\"Shoes\\\"\", \"cluster_id_left\": \"14154487\", \"brand_left\": \"\\\"\\\\\\\"Nike\\\\\\\"@en\\\"\", \"title_left\": \"\\\" \\\\\\\"Nike Air Max 90 Essential\\\\\\\"@en Essential Black...\", \"description_left\": \"\\\"\\\\\\\"Black/Wolf Grey-White\\\\n537384-053\\\\nFirst introd...\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"3418973\", \"category_right\": \"\\\"Shoes\\\"\", \"cluster_id_right\": \"14153817\", \"brand_right\": \"\\\"\\\\\\\"Nike\\\\\\\"@en\\\"\", \"title_right\": \"\\\" \\\\\\\"Nike Air Max 90 Essential\\\\\\\"@en Essential Unive...\", \"description_right\": \"\\\"\\\\\\\"University Blue/Pure Platinum-Obsidian-White\\\\n5...\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "shoes_medium": {"config_name": "shoes_medium", "sample_row": "{\"pair_id\": \"\\\"8203003#16629600\\\"\", \"label\": \"0\", \"id_left\": \"8203003\", \"category_left\": \"\\\"Shoes\\\"\", \"cluster_id_left\": \"8515872\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"NIKE AIR MAX 90 ULTRA 2.0 LTR\\\\\\\"@pl \\\\\\\"Nowo\\\\u015...\", \"description_left\": \"\\\"\\\\\\\"Null\\\\\\\"@pl \\\"\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"16629600\", \"category_right\": \"\\\"Shoes\\\"\", \"cluster_id_right\": \"3222506\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Nike Sportswear Air Max 90 Ultra Moire - Czarn...\", \"description_right\": \"null\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}, "shoes_small": {"config_name": "shoes_small", "sample_row": "{\"pair_id\": \"\\\"5479787#15837383\\\"\", \"label\": \"1\", \"id_left\": \"5479787\", \"category_left\": \"\\\"Shoes\\\"\", \"cluster_id_left\": \"2569194\", \"brand_left\": \"null\", \"title_left\": \"\\\" \\\\\\\"Nike Flex 2016 Run - Nero/Bianco/Grigio\\\\\\\"@it \\\\...\", \"description_left\": \"null\", \"price_left\": \"null\", \"specTableContent_left\": \"null\", \"id_right\": \"15837383\", \"category_right\": \"\\\"Shoes\\\"\", \"cluster_id_right\": \"2569194\", \"brand_right\": \"null\", \"title_right\": \"\\\" \\\\\\\"Nike sneaker flex 2016 rn\\\\\\\"@en-gb \\\\\\\"Tudo para ...\", \"description_right\": \"null\", \"price_right\": \"null\", \"specTableContent_right\": \"null\"}", "columns": ["pair_id", "label", "id_left", "category_left", "cluster_id_left", "brand_left", "title_left", "description_left", "price_left", "specTableContent_left", "id_right", "category_right", "cluster_id_right", "brand_right", "title_right", "description_right", "price_right", "specTableContent_right"], "columns_mapping": {"pair_id": "pair_id", "label": "label", "id_left": "id_left", "category_left": "category_left", "cluster_id_left": "cluster_id_left", "brand_left": "brand_left", "title_left": "title_left", "description_left": "description_left", "price_left": "price_left", "specTableContent_left": "specTableContent_left", "id_right": "id_right", "category_right": "category_right", "cluster_id_right": "cluster_id_right", "brand_right": "brand_right", "title_right": "title_right", "description_right": "description_right", "price_right": "price_right", "specTableContent_right": "specTableContent_right"}, "dataset_description": "Many e-shops have started to mark-up product data within their HTML pages using the schema.org vocabulary. The Web Data Commons project regularly extracts such data from the Common Crawl, a large public web crawl. The Web Data Commons Training and Test Sets for Large-Scale Product Matching contain product offers from different e-shops in the form of binary product pairs (with corresponding label \"match\" or \"no match\")\n\nIn order to support the evaluation of machine learning-based matching methods, the data is split into training, validation and test set. We provide training and validation sets in four different sizes for four product categories. The labels of the test sets were manually checked while those of the training sets were derived using shared product identifiers from the Web via weak supervision.\n\nThe data stems from the WDC Product Data Corpus for Large-Scale Product Matching - Version 2.0 which consists of 26 million product offers originating from 79 thousand websites.\n", "dataset_name": "wdc/products-2017"}}, "tags": ["task_categories:text-classification", "annotations_creators:weak supervision", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "strombergnlp/x-stance": {"dataset_name": "strombergnlp/x-stance", "description": "The x-stance dataset contains more than 150 political questions, and 67k comments written by candidates on those questions. The comments are partly German, partly French and Italian. The data have been extracted from the Swiss voting advice platform Smartvote.", "downloads": 69, "configs": {"de": {"config_name": "de", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question\": \"\\\"Eine Volksinitiative fordert, dass die Gesamtfl\\\\u...\", \"comment\": \"\\\"Eine fixe Gr\\\\u00f6sse verbieten, ist das falsche ...\", \"label\": \"0\"}", "columns": ["id", "question", "comment", "label"], "columns_mapping": {"id": "id", "question": "question", "comment": "comment", "label": "label"}, "dataset_description": "The x-stance dataset contains more than 150 political questions, and 67k comments written by candidates on those questions. The comments are partly German, partly French and Italian. The data have been extracted from the Swiss voting advice platform Smartvote.\n", "dataset_name": "strombergnlp/x-stance"}, "fr": {"config_name": "fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"question\": \"\\\"Seriez-vous favorable \\\\u00e0 ce que l'euthanasie ...\", \"comment\": \"\\\"C'est un sujet d\\\\u00e9licat, tout d\\\\u00e9pend de ...\", \"label\": \"1\"}", "columns": ["id", "question", "comment", "label"], "columns_mapping": {"id": "id", "question": "question", "comment": "comment", "label": "label"}, "dataset_description": "The x-stance dataset contains more than 150 political questions, and 67k comments written by candidates on those questions. The comments are partly German, partly French and Italian. The data have been extracted from the Swiss voting advice platform Smartvote.\n", "dataset_name": "strombergnlp/x-stance"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:crowdsourced", "multilinguality:multilingual", "language:de", "language:fr", "stance-detection"], "is_gated": false}, "WorkInTheDark/FairytaleQA": {"dataset_name": "WorkInTheDark/FairytaleQA", "description": "FairytaleQA dataset, an open-source dataset focusing on comprehension of narratives, targeting students from kindergarten to eighth grade. The FairytaleQA dataset is annotated by education experts based on an evidence-based theoretical framework. It consists of 10,580 explicit and implicit questions derived from 278 children-friendly stories, covering seven types of narrative elements or relations.", "downloads": 310, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"story_name\": \"\\\"three-dogs\\\"\", \"story_section\": \"\\\"once upon a time there was a king who went forth ...\", \"question\": \"\\\"why was there great rejoicing in the city and thr...\", \"answer1\": \"\\\"the people wished their king all that was good .\\\"...\", \"answer2\": \"\\\"\\\"\", \"local-or-sum\": \"\\\"local\\\"\", \"attribute\": \"\\\"causal relationship\\\"\", \"ex-or-im\": \"\\\"explicit\\\"\", \"ex-or-im2\": \"\\\"\\\"\"}", "columns": ["story_name", "story_section", "question", "answer1", "answer2", "local-or-sum", "attribute", "ex-or-im", "ex-or-im2"], "columns_mapping": {"story_name": "story_name", "story_section": "story_section", "question": "question", "answer1": "answer1", "answer2": "answer2", "local-or-sum": "local-or-sum", "attribute": "attribute", "ex-or-im": "ex-or-im", "ex-or-im2": "ex-or-im2"}, "dataset_description": "FairytaleQA dataset, an open-source dataset focusing on comprehension of narratives, targeting students from kindergarten to eighth grade. The FairytaleQA dataset is annotated by education experts based on an evidence-based theoretical framework. It consists of 10,580 explicit and implicit questions derived from 278 children-friendly stories, covering seven types of narrative elements or relations.\n", "dataset_name": "WorkInTheDark/FairytaleQA"}}, "tags": ["task_categories:question-answering", "task_categories:text-generation", "language:en", "education", "children education"], "is_gated": false}, "strombergnlp/nlpcc-stance": {"dataset_name": "strombergnlp/nlpcc-stance", "description": "This is a stance prediction dataset in Chinese.\nThe data is that from a shared task, stance detection in Chinese microblogs, in NLPCC-ICCPOL 2016. It covers Task A, a mandatory supervised task which detects stance towards five targets of interest with given labeled data.", "downloads": 25, "configs": {"task_a": {"config_name": "task_a", "sample_row": "{\"id\": \"\\\"0\\\"\", \"target\": \"\\\"IphoneSE\\\"\", \"text\": \"\\\"3\\\\u670831\\\\u65e5\\\\uff0c\\\\u82f9\\\\u679ciPhone SE\\\\u6b63\\\\...\", \"stance\": \"2\"}", "columns": ["id", "target", "text", "stance"], "columns_mapping": {"id": "id", "target": "target", "text": "text", "stance": "stance"}, "dataset_description": "This is a stance prediction dataset in Chinese.\nThe data is that from a shared task, stance detection in Chinese microblogs, in NLPCC-ICCPOL 2016. It covers Task A, a mandatory supervised task which detects stance towards five targets of interest with given labeled data. \n", "dataset_name": "strombergnlp/nlpcc-stance"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-analysis", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:zh", "stance-detection"], "is_gated": false}, "GEM/FairytaleQA": {"dataset_name": "GEM/FairytaleQA", "description": "\\\r\nThe FairytaleQA dataset focusing on narrative comprehension of kindergarten to eighth-grade students. Generated by educational experts based on an evidence-based theoretical framework, FairytaleQA consists of 10,580 explicit and implicit questions derived from 278 children-friendly stories, covering seven types of narrative elements or relations. This is for the Question Generation Task of FairytaleQA.", "downloads": 29, "configs": {"default": {"config_name": "default", "sample_row": "{\"story_name\": \"\\\"three-dogs\\\"\", \"content\": \"\\\"once upon a time there was a king who went forth ...\", \"answer\": \"\\\"the people wished their king all that was good .\\\"...\", \"question\": \"\\\"why was there great rejoicing in the city and thr...\", \"gem_id\": \"\\\"GEM-FairytaleQA-train-0\\\"\", \"target\": \"\\\"why was there great rejoicing in the city and thr...\", \"references\": \"[]\", \"local_or_sum\": \"\\\"local\\\"\", \"attribute\": \"\\\"causal relationship\\\"\", \"ex_or_im\": \"\\\"explicit\\\"\"}", "columns": ["story_name", "content", "answer", "question", "gem_id", "target", "references", "local_or_sum", "attribute", "ex_or_im"], "columns_mapping": {"story_name": "story_name", "content": "content", "answer": "answer", "question": "question", "gem_id": "gem_id", "target": "target", "references": "references", "local_or_sum": "local_or_sum", "attribute": "attribute", "ex_or_im": "ex_or_im"}, "dataset_description": "The FairytaleQA dataset focusing on narrative comprehension of kindergarten to eighth-grade students. Generated by educational experts based on an evidence-based theoretical framework, FairytaleQA consists of 10,580 explicit and implicit questions derived from 278 children-friendly stories, covering seven types of narrative elements or relations. This is for the Question Generation Task of FairytaleQA.\n", "dataset_name": "GEM/FairytaleQA"}}, "tags": ["task_categories:other", "annotations_creators:expert-created", "multilinguality:unknown", "source_datasets:original", "language:en", "question-generation"], "is_gated": false}, "strombergnlp/ans-stance": {"dataset_name": "strombergnlp/ans-stance", "description": "The dataset is a collection of news titles in arabic along with paraphrased and corrupted titles. The stance prediction version is a 3-class classification task. Data contains three columns: s1, s2, stance.", "downloads": 20, "configs": {"stance": {"config_name": "stance", "sample_row": "{\"id\": \"\\\"0\\\"\", \"s1\": \"\\\"\\\\u0647\\\\u062c\\\\u0648\\\\u0645 \\\\u0635\\\\u0627\\\\u0631\\\\u0648...\", \"s2\": \"\\\"\\\\u0647\\\\u062f\\\\u0648\\\\u0621 \\\\u0627\\\\u0644\\\\u0627\\\\u0634...\", \"stance\": \"0\"}", "columns": ["id", "s1", "s2", "stance"], "columns_mapping": {"id": "id", "s1": "s1", "s2": "s2", "stance": "stance"}, "dataset_description": "The dataset is a collection of news titles in arabic along with paraphrased and corrupted titles. The stance prediction version is a 3-class classification task. Data contains three columns: s1, s2, stance.\n", "dataset_name": "strombergnlp/ans-stance"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ar", "stance-detection"], "is_gated": false}, "launch/gov_report": {"dataset_name": "launch/gov_report", "description": "GovReport long document summarization dataset.\n\nThere are three configs:\n - plain_text: plain text document-to-summary pairs\n - plain_text_with_recommendations: plain text doucment-summary pairs, with \"What GAO recommends\" included in the summary\n - structure: data with section structure", "downloads": 1045, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"GAO_GAO-06-1085\\\"\", \"document\": \"\\\"Background The structure of the armed forces is b...\", \"summary\": \"\\\"As the Department of Defense (DOD) has expanded i...\"}", "columns": ["id", "document", "summary"], "columns_mapping": {"id": "id", "document": "document", "summary": "summary"}, "dataset_description": "GovReport long document summarization dataset.\n\nThere are three configs:\n - plain_text: plain text document-to-summary pairs\n - plain_text_with_recommendations: plain text doucment-summary pairs, with \"What GAO recommends\" included in the summary\n - structure: data with section structure\n", "dataset_name": "launch/gov_report"}, "plain_text_with_recommendations": {"config_name": "plain_text_with_recommendations", "sample_row": "{\"id\": \"\\\"GAO_GAO-06-1085\\\"\", \"document\": \"\\\"Background The structure of the armed forces is b...\", \"summary\": \"\\\"As the Department of Defense (DOD) has expanded i...\"}", "columns": ["id", "document", "summary"], "columns_mapping": {"id": "id", "document": "document", "summary": "summary"}, "dataset_description": "GovReport long document summarization dataset.\n\nThere are three configs:\n - plain_text: plain text document-to-summary pairs\n - plain_text_with_recommendations: plain text doucment-summary pairs, with \"What GAO recommends\" included in the summary\n - structure: data with section structure\n", "dataset_name": "launch/gov_report"}, "structure": {"config_name": "structure", "sample_row": "{\"id\": \"\\\"GAO_GAO-06-1085\\\"\", \"document_sections.title\": \"[\\\"Background\\\", \\\"DOD Has Established Force Health P...\", \"document_sections.paragraphs\": \"[\\\"The structure of the armed forces is based on th...\", \"document_sections.depth\": \"[1, 1, 2, 3, 3, 2, 2, 1, 1, 2, 2, 3, 3, 3, 2, 1, 1...\", \"summary_sections.title\": \"[\\\"Why GAO Did This Study\\\", \\\"What GAO Found\\\"]\", \"summary_sections.paragraphs\": \"[\\\"As the Department of Defense (DOD) has expanded ...\"}", "columns": ["id", "document_sections_title", "document_sections_paragraphs", "document_sections_depth", "summary_sections_title", "summary_sections_paragraphs"], "columns_mapping": {"id": "id", "document_sections.title": "document_sections_title", "document_sections.paragraphs": "document_sections_paragraphs", "document_sections.depth": "document_sections_depth", "summary_sections.title": "summary_sections_title", "summary_sections.paragraphs": "summary_sections_paragraphs"}, "dataset_description": "GovReport long document summarization dataset.\n\nThere are three configs:\n - plain_text: plain text document-to-summary pairs\n - plain_text_with_recommendations: plain text doucment-summary pairs, with \"What GAO recommends\" included in the summary\n - structure: data with section structure\n", "dataset_name": "launch/gov_report"}}, "tags": ["task_categories:summarization", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ekinakyurek/ftrace": {"dataset_name": "ekinakyurek/ftrace", "description": " Factual Tracing Dataset that contains queries and abstracts, and their corresponding ground truth.", "downloads": 91, "configs": {"abstracts": {"config_name": "abstracts", "sample_row": "{\"inputs_pretokenized\": \"\\\"The Austroasiatic languages, in recent classifica...\", \"targets_pretokenized\": \"\\\" Bangladesh\\\"\", \"masked_uri\": \"\\\"Q902\\\"\", \"masked_type\": \"\\\"object\\\"\", \"facts\": \"\\\"P31,Q25295,Q33199;P47,Q902,Q668;P47,Q837,Q668;P47...\", \"id\": \"\\\"3\\\"\", \"example_uris\": \"\\\"Q33199-0-Q902-Q668-0;Q33199-0-Q668-Q902-1\\\"\", \"page_uri\": \"\\\"Q33199\\\"\"}", "columns": ["inputs_pretokenized", "targets_pretokenized", "masked_uri", "masked_type", "facts", "id", "example_uris", "page_uri"], "columns_mapping": {"inputs_pretokenized": "inputs_pretokenized", "targets_pretokenized": "targets_pretokenized", "masked_uri": "masked_uri", "masked_type": "masked_type", "facts": "facts", "id": "id", "example_uris": "example_uris", "page_uri": "page_uri"}, "dataset_description": " Factual Tracing Dataset that contains queries and abstracts, and their corresponding ground truth.\n Abstracts based on TREx dataset.\n", "dataset_name": "ekinakyurek/ftrace"}, "queries": {"config_name": "queries", "sample_row": "{\"inputs_pretokenized\": \"\\\"Member of the Scottish Parliament is a legal term...\", \"targets_pretokenized\": \"\\\" Scotland\\\"\", \"uuid\": \"\\\"0eb8ef92-c539-4498-b845-0c7f6d415b71\\\"\", \"obj_uri\": \"\\\"Q22\\\"\", \"sub_uri\": \"\\\"Q1711695\\\"\", \"predicate_id\": \"\\\"P1001\\\"\", \"sub_surface\": \"\\\"Member of the Scottish Parliament\\\"\", \"obj_surface\": \"\\\"Scotland\\\"\"}", "columns": ["inputs_pretokenized", "targets_pretokenized", "uuid", "obj_uri", "sub_uri", "predicate_id", "sub_surface", "obj_surface"], "columns_mapping": {"inputs_pretokenized": "inputs_pretokenized", "targets_pretokenized": "targets_pretokenized", "uuid": "uuid", "obj_uri": "obj_uri", "sub_uri": "sub_uri", "predicate_id": "predicate_id", "sub_surface": "sub_surface", "obj_surface": "obj_surface"}, "dataset_description": " Factual Tracing Dataset that contains queries and abstracts, and their corresponding ground truth.\n Queries based on LAMA dataset.\n", "dataset_name": "ekinakyurek/ftrace"}}, "tags": ["task_ids:masked-language-modeling", "multilinguality:monolingual", "source_datasets:TRex", "source_datasets:Lama", "language:en"], "is_gated": false}, "GroNLP/divemt": {"dataset_name": "GroNLP/divemt", "description": "DivEMT is the first publicly available post-editing study of Neural Machine Translation (NMT) over a typologically diverse set of target languages. Using a strictly controlled setup, 18 professional translators were instructed to translate or post-edit the same set of English documents into Arabic, Dutch, Italian, Turkish, Ukrainian, and Vietnamese. During the process, their edits, keystrokes, editing times, pauses, and perceived effort were logged, enabling an in-depth, cross-lingual evaluation of NMT quality and its post-editing process.", "downloads": 32, "configs": {"warmup": {"config_name": "warmup", "sample_row": "{\"unit_id\": \"\\\"flores101-warmup-tur-1-ht-1\\\"\", \"flores_id\": \"163\", \"item_id\": \"\\\"flores101-warmup-11\\\"\", \"subject_id\": \"\\\"tur_t1\\\"\", \"lang_id\": \"\\\"tur\\\"\", \"doc_id\": \"1\", \"task_type\": \"\\\"ht\\\"\", \"translation_type\": \"\\\"ht\\\"\", \"src_len_chr\": \"189\", \"mt_len_chr\": \"NaN\", \"tgt_len_chr\": \"192\", \"src_len_wrd\": \"29\", \"mt_len_wrd\": \"NaN\", \"tgt_len_wrd\": \"27\", \"edit_time\": \"88.624\", \"k_total\": \"660\", \"k_letter\": \"472\", \"k_digit\": \"0\", \"k_white\": \"68\", \"k_symbol\": \"14\", \"k_nav\": \"72\", \"k_erase\": \"34\", \"k_copy\": \"0\", \"k_cut\": \"0\", \"k_paste\": \"0\", \"k_do\": \"0\", \"n_pause_geq_300\": \"37\", \"len_pause_geq_300\": \"728697\", \"n_pause_geq_1000\": \"14\", \"len_pause_geq_1000\": \"716844\", \"event_time\": \"761283\", \"num_annotations\": \"1\", \"last_modification_time\": \"1642600367\", \"n_insert\": \"NaN\", \"n_delete\": \"NaN\", \"n_substitute\": \"NaN\", \"n_shift\": \"NaN\", \"tot_shifted_words\": \"NaN\", \"tot_edits\": \"NaN\", \"hter\": \"NaN\", \"cer\": \"NaN\", \"bleu\": \"NaN\", \"chrf\": \"NaN\", \"time_s\": \"761.283\", \"time_m\": \"12.688\", \"time_h\": \"0.2115\", \"time_per_char\": \"4.028\", \"time_per_word\": \"26.2511\", \"key_per_char\": \"3.4921\", \"words_per_hour\": \"137.1369\", \"words_per_minute\": \"2.2856\", \"per_subject_visit_order\": \"1\", \"src_text\": \"\\\"In France, voting has traditionally been a low-te...\", \"mt_text\": \"\\\"nan\\\"\", \"tgt_text\": \"\\\"Fransa'da oy verme deneyimi geleneksel olarak pek...\", \"aligned_edit\": \"\\\"nan\\\"\", \"src_tokens\": \"[\\\"In\\\", \\\"France\\\", \\\",\\\", \\\"voting\\\", \\\"has\\\", \\\"traditiona...\", \"src_annotations.lemma\": \"[\\\"in\\\", \\\"France\\\", \\\",\\\", \\\"voting\\\", \\\"have\\\", \\\"tradition...\", \"src_annotations.upos\": \"[\\\"ADP\\\", \\\"PROPN\\\", \\\"PUNCT\\\", \\\"NOUN\\\", \\\"AUX\\\", \\\"ADV\\\", \\\"A...\", \"src_annotations.feats\": \"[\\\"\\\", \\\"Number=Sing\\\", \\\"\\\", \\\"Number=Sing\\\", \\\"Mood=Ind|N...\", \"src_annotations.head\": \"[\\\"2\\\", \\\"12\\\", \\\"12\\\", \\\"12\\\", \\\"12\\\", \\\"12\\\", \\\"12\\\", \\\"12\\\", \\\"1...\", \"src_annotations.deprel\": \"[\\\"case\\\", \\\"obl\\\", \\\"punct\\\", \\\"nsubj\\\", \\\"aux\\\", \\\"advmod\\\",...\", \"src_annotations.start_char\": \"[0, 3, 9, 11, 18, 22, 36, 41, 43, 46, 47, 52, 62, ...\", \"src_annotations.end_char\": \"[2, 9, 10, 17, 21, 35, 40, 42, 46, 47, 51, 62, 63,...\", \"src_annotations.ner\": \"[\\\"O\\\", \\\"S-GPE\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", ...\", \"mt_tokens\": \"[]\", \"mt_annotations.lemma\": \"[]\", \"mt_annotations.upos\": \"[]\", \"mt_annotations.feats\": \"[]\", \"mt_annotations.head\": \"[]\", \"mt_annotations.deprel\": \"[]\", \"mt_annotations.start_char\": \"[]\", \"mt_annotations.end_char\": \"[]\", \"mt_annotations.ner\": \"[]\", \"tgt_tokens\": \"[\\\"Fransa'da\\\", \\\"oy\\\", \\\"verme\\\", \\\"deneyimi\\\", \\\"geleneks...\", \"tgt_annotations.lemma\": \"[\\\"Fransa\\\", \\\"oy\\\", \\\"ver\\\", \\\"deneyim\\\", \\\"geleneksel\\\", \\\"...\", \"tgt_annotations.upos\": \"[\\\"PROPN\\\", \\\"NOUN\\\", \\\"VERB\\\", \\\"NOUN\\\", \\\"ADJ\\\", \\\"ADP\\\", \\\"A...\", \"tgt_annotations.feats\": \"[\\\"Case=Loc|Number=Sing|Person=3\\\", \\\"Case=Nom|Number...\", \"tgt_annotations.head\": \"[\\\"2\\\", \\\"4\\\", \\\"2\\\", \\\"11\\\", \\\"9\\\", \\\"5\\\", \\\"8\\\", \\\"9\\\", \\\"11\\\", \\\"1...\", \"tgt_annotations.deprel\": \"[\\\"nmod\\\", \\\"nmod:poss\\\", \\\"compound\\\", \\\"nsubj\\\", \\\"amod\\\",...\", \"tgt_annotations.start_char\": \"[0, 10, 13, 19, 28, 39, 46, 50, 61, 69, 73, 81, 83...\", \"tgt_annotations.end_char\": \"[9, 12, 18, 27, 38, 45, 49, 60, 68, 72, 81, 82, 85...\", \"tgt_annotations.ner\": \"[\\\"S-LOCATION\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", ...\", \"src_wmt22_qe\": \"[]\", \"mt_wmt22_qe\": \"[]\"}", "columns": ["unit_id", "flores_id", "item_id", "subject_id", "lang_id", "doc_id", "task_type", "translation_type", "src_len_chr", "mt_len_chr", "tgt_len_chr", "src_len_wrd", "mt_len_wrd", "tgt_len_wrd", "edit_time", "k_total", "k_letter", "k_digit", "k_white", "k_symbol", "k_nav", "k_erase", "k_copy", "k_cut", "k_paste", "k_do", "n_pause_geq_300", "len_pause_geq_300", "n_pause_geq_1000", "len_pause_geq_1000", "event_time", "num_annotations", "last_modification_time", "n_insert", "n_delete", "n_substitute", "n_shift", "tot_shifted_words", "tot_edits", "hter", "cer", "bleu", "chrf", "time_s", "time_m", "time_h", "time_per_char", "time_per_word", "key_per_char", "words_per_hour", "words_per_minute", "per_subject_visit_order", "src_text", "mt_text", "tgt_text", "aligned_edit", "src_tokens", "src_annotations_lemma", "src_annotations_upos", "src_annotations_feats", "src_annotations_head", "src_annotations_deprel", "src_annotations_start_char", "src_annotations_end_char", "src_annotations_ner", "mt_tokens", "mt_annotations_lemma", "mt_annotations_upos", "mt_annotations_feats", "mt_annotations_head", "mt_annotations_deprel", "mt_annotations_start_char", "mt_annotations_end_char", "mt_annotations_ner", "tgt_tokens", "tgt_annotations_lemma", "tgt_annotations_upos", "tgt_annotations_feats", "tgt_annotations_head", "tgt_annotations_deprel", "tgt_annotations_start_char", "tgt_annotations_end_char", "tgt_annotations_ner", "src_wmt22_qe", "mt_wmt22_qe"], "columns_mapping": {"unit_id": "unit_id", "flores_id": "flores_id", "item_id": "item_id", "subject_id": "subject_id", "lang_id": "lang_id", "doc_id": "doc_id", "task_type": "task_type", "translation_type": "translation_type", "src_len_chr": "src_len_chr", "mt_len_chr": "mt_len_chr", "tgt_len_chr": "tgt_len_chr", "src_len_wrd": "src_len_wrd", "mt_len_wrd": "mt_len_wrd", "tgt_len_wrd": "tgt_len_wrd", "edit_time": "edit_time", "k_total": "k_total", "k_letter": "k_letter", "k_digit": "k_digit", "k_white": "k_white", "k_symbol": "k_symbol", "k_nav": "k_nav", "k_erase": "k_erase", "k_copy": "k_copy", "k_cut": "k_cut", "k_paste": "k_paste", "k_do": "k_do", "n_pause_geq_300": "n_pause_geq_300", "len_pause_geq_300": "len_pause_geq_300", "n_pause_geq_1000": "n_pause_geq_1000", "len_pause_geq_1000": "len_pause_geq_1000", "event_time": "event_time", "num_annotations": "num_annotations", "last_modification_time": "last_modification_time", "n_insert": "n_insert", "n_delete": "n_delete", "n_substitute": "n_substitute", "n_shift": "n_shift", "tot_shifted_words": "tot_shifted_words", "tot_edits": "tot_edits", "hter": "hter", "cer": "cer", "bleu": "bleu", "chrf": "chrf", "time_s": "time_s", "time_m": "time_m", "time_h": "time_h", "time_per_char": "time_per_char", "time_per_word": "time_per_word", "key_per_char": "key_per_char", "words_per_hour": "words_per_hour", "words_per_minute": "words_per_minute", "per_subject_visit_order": "per_subject_visit_order", "src_text": "src_text", "mt_text": "mt_text", "tgt_text": "tgt_text", "aligned_edit": "aligned_edit", "src_tokens": "src_tokens", "src_annotations.lemma": "src_annotations_lemma", "src_annotations.upos": "src_annotations_upos", "src_annotations.feats": "src_annotations_feats", "src_annotations.head": "src_annotations_head", "src_annotations.deprel": "src_annotations_deprel", "src_annotations.start_char": "src_annotations_start_char", "src_annotations.end_char": "src_annotations_end_char", "src_annotations.ner": "src_annotations_ner", "mt_tokens": "mt_tokens", "mt_annotations.lemma": "mt_annotations_lemma", "mt_annotations.upos": "mt_annotations_upos", "mt_annotations.feats": "mt_annotations_feats", "mt_annotations.head": "mt_annotations_head", "mt_annotations.deprel": "mt_annotations_deprel", "mt_annotations.start_char": "mt_annotations_start_char", "mt_annotations.end_char": "mt_annotations_end_char", "mt_annotations.ner": "mt_annotations_ner", "tgt_tokens": "tgt_tokens", "tgt_annotations.lemma": "tgt_annotations_lemma", "tgt_annotations.upos": "tgt_annotations_upos", "tgt_annotations.feats": "tgt_annotations_feats", "tgt_annotations.head": "tgt_annotations_head", "tgt_annotations.deprel": "tgt_annotations_deprel", "tgt_annotations.start_char": "tgt_annotations_start_char", "tgt_annotations.end_char": "tgt_annotations_end_char", "tgt_annotations.ner": "tgt_annotations_ner", "src_wmt22_qe": "src_wmt22_qe", "mt_wmt22_qe": "mt_wmt22_qe"}, "dataset_description": "DivEMT is the first publicly available post-editing study of Neural Machine Translation (NMT) over a typologically diverse set of target languages. Using a strictly controlled setup, 18 professional translators were instructed to translate or post-edit the same set of English documents into Arabic, Dutch, Italian, Turkish, Ukrainian, and Vietnamese. During the process, their edits, keystrokes, editing times, pauses, and perceived effort were logged, enabling an in-depth, cross-lingual evaluation of NMT quality and its post-editing process.\n", "dataset_name": "GroNLP/divemt"}, "main": {"config_name": "main", "sample_row": "{\"unit_id\": \"\\\"flores101-main-tur-1-ht-1\\\"\", \"flores_id\": \"205\", \"item_id\": \"\\\"flores101-main-11\\\"\", \"subject_id\": \"\\\"tur_t1\\\"\", \"lang_id\": \"\\\"tur\\\"\", \"doc_id\": \"1\", \"task_type\": \"\\\"ht\\\"\", \"translation_type\": \"\\\"ht\\\"\", \"src_len_chr\": \"155\", \"mt_len_chr\": \"NaN\", \"tgt_len_chr\": \"147\", \"src_len_wrd\": \"25\", \"mt_len_wrd\": \"NaN\", \"tgt_len_wrd\": \"18\", \"edit_time\": \"100.306\", \"k_total\": \"260\", \"k_letter\": \"181\", \"k_digit\": \"4\", \"k_white\": \"26\", \"k_symbol\": \"5\", \"k_nav\": \"20\", \"k_erase\": \"24\", \"k_copy\": \"0\", \"k_cut\": \"0\", \"k_paste\": \"0\", \"k_do\": \"0\", \"n_pause_geq_300\": \"34\", \"len_pause_geq_300\": \"66281\", \"n_pause_geq_1000\": \"10\", \"len_pause_geq_1000\": \"52888\", \"event_time\": \"100306\", \"num_annotations\": \"1\", \"last_modification_time\": \"1643314612\", \"n_insert\": \"NaN\", \"n_delete\": \"NaN\", \"n_substitute\": \"NaN\", \"n_shift\": \"NaN\", \"tot_shifted_words\": \"NaN\", \"tot_edits\": \"NaN\", \"hter\": \"NaN\", \"cer\": \"NaN\", \"bleu\": \"NaN\", \"chrf\": \"NaN\", \"time_s\": \"100.306\", \"time_m\": \"1.6718\", \"time_h\": \"0.0279\", \"time_per_char\": \"0.6471\", \"time_per_word\": \"4.0122\", \"key_per_char\": \"1.6774\", \"words_per_hour\": \"897.2544\", \"words_per_minute\": \"14.9542\", \"per_subject_visit_order\": \"1\", \"src_text\": \"\\\"UN peacekeepers, whom arrived in Haiti after the ...\", \"mt_text\": \"\\\"nan\\\"\", \"tgt_text\": \"\\\"2010 depreminden sonra Haiti'ye giden BM arabuluc...\", \"aligned_edit\": \"\\\"nan\\\"\", \"src_tokens\": \"[\\\"UN\\\", \\\"peacekeepers\\\", \\\",\\\", \\\"whom\\\", \\\"arrived\\\", \\\"in...\", \"src_annotations.lemma\": \"[\\\"UN\\\", \\\"peacekeeper\\\", \\\",\\\", \\\"whom\\\", \\\"arrive\\\", \\\"in\\\",...\", \"src_annotations.upos\": \"[\\\"PROPN\\\", \\\"NOUN\\\", \\\"PUNCT\\\", \\\"PRON\\\", \\\"VERB\\\", \\\"ADP\\\", ...\", \"src_annotations.feats\": \"[\\\"Number=Sing\\\", \\\"Number=Plur\\\", \\\"\\\", \\\"PronType=Rel\\\",...\", \"src_annotations.head\": \"[\\\"2\\\", \\\"15\\\", \\\"5\\\", \\\"5\\\", \\\"2\\\", \\\"7\\\", \\\"5\\\", \\\"11\\\", \\\"11\\\", \\\"...\", \"src_annotations.deprel\": \"[\\\"compound\\\", \\\"nsubj:pass\\\", \\\"punct\\\", \\\"nsubj\\\", \\\"acl:...\", \"src_annotations.start_char\": \"[0, 3, 15, 17, 22, 30, 33, 39, 45, 49, 54, 64, 66,...\", \"src_annotations.end_char\": \"[2, 15, 16, 21, 29, 32, 38, 44, 48, 53, 64, 65, 69...\", \"src_annotations.ner\": \"[\\\"S-ORG\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"O\\\", \\\"S-GPE\\\", \\\"O\\\", \\\"...\", \"mt_tokens\": \"[]\", \"mt_annotations.lemma\": \"[]\", \"mt_annotations.upos\": \"[]\", \"mt_annotations.feats\": \"[]\", \"mt_annotations.head\": \"[]\", \"mt_annotations.deprel\": \"[]\", \"mt_annotations.start_char\": \"[]\", \"mt_annotations.end_char\": \"[]\", \"mt_annotations.ner\": \"[]\", \"tgt_tokens\": \"[\\\"2010\\\", \\\"depreminden\\\", \\\"sonra\\\", \\\"Haiti'ye\\\", \\\"gide...\", \"tgt_annotations.lemma\": \"[\\\"2010\\\", \\\"deprem\\\", \\\"sonra\\\", \\\"Haiti\\\", \\\"git\\\", \\\"Bm\\\", ...\", \"tgt_annotations.upos\": \"[\\\"NUM\\\", \\\"NOUN\\\", \\\"ADP\\\", \\\"PROPN\\\", \\\"VERB\\\", \\\"NOUN\\\", \\\"N...\", \"tgt_annotations.feats\": \"[\\\"Case=Nom|NumType=Card|Number=Sing|Person=3\\\", \\\"Ca...\", \"tgt_annotations.head\": \"[\\\"5\\\", \\\"1\\\", \\\"2\\\", \\\"5\\\", \\\"7\\\", \\\"7\\\", \\\"10\\\", \\\"9\\\", \\\"10\\\", \\\"1...\", \"tgt_annotations.deprel\": \"[\\\"nummod\\\", \\\"flat\\\", \\\"case\\\", \\\"obl\\\", \\\"acl\\\", \\\"nmod:pos...\", \"tgt_annotations.start_char\": \"[0, 5, 17, 23, 32, 38, 41, 58, 67, 75, 85, 94, 102...\", \"tgt_annotations.end_char\": \"[4, 16, 22, 31, 37, 40, 57, 66, 74, 84, 93, 101, 1...\", \"tgt_annotations.ner\": \"[\\\"S-TIME\\\", \\\"O\\\", \\\"O\\\", \\\"S-LOCATION\\\", \\\"O\\\", \\\"S-ORGANIZ...\", \"src_wmt22_qe\": \"[]\", \"mt_wmt22_qe\": \"[]\"}", "columns": ["unit_id", "flores_id", "item_id", "subject_id", "lang_id", "doc_id", "task_type", "translation_type", "src_len_chr", "mt_len_chr", "tgt_len_chr", "src_len_wrd", "mt_len_wrd", "tgt_len_wrd", "edit_time", "k_total", "k_letter", "k_digit", "k_white", "k_symbol", "k_nav", "k_erase", "k_copy", "k_cut", "k_paste", "k_do", "n_pause_geq_300", "len_pause_geq_300", "n_pause_geq_1000", "len_pause_geq_1000", "event_time", "num_annotations", "last_modification_time", "n_insert", "n_delete", "n_substitute", "n_shift", "tot_shifted_words", "tot_edits", "hter", "cer", "bleu", "chrf", "time_s", "time_m", "time_h", "time_per_char", "time_per_word", "key_per_char", "words_per_hour", "words_per_minute", "per_subject_visit_order", "src_text", "mt_text", "tgt_text", "aligned_edit", "src_tokens", "src_annotations_lemma", "src_annotations_upos", "src_annotations_feats", "src_annotations_head", "src_annotations_deprel", "src_annotations_start_char", "src_annotations_end_char", "src_annotations_ner", "mt_tokens", "mt_annotations_lemma", "mt_annotations_upos", "mt_annotations_feats", "mt_annotations_head", "mt_annotations_deprel", "mt_annotations_start_char", "mt_annotations_end_char", "mt_annotations_ner", "tgt_tokens", "tgt_annotations_lemma", "tgt_annotations_upos", "tgt_annotations_feats", "tgt_annotations_head", "tgt_annotations_deprel", "tgt_annotations_start_char", "tgt_annotations_end_char", "tgt_annotations_ner", "src_wmt22_qe", "mt_wmt22_qe"], "columns_mapping": {"unit_id": "unit_id", "flores_id": "flores_id", "item_id": "item_id", "subject_id": "subject_id", "lang_id": "lang_id", "doc_id": "doc_id", "task_type": "task_type", "translation_type": "translation_type", "src_len_chr": "src_len_chr", "mt_len_chr": "mt_len_chr", "tgt_len_chr": "tgt_len_chr", "src_len_wrd": "src_len_wrd", "mt_len_wrd": "mt_len_wrd", "tgt_len_wrd": "tgt_len_wrd", "edit_time": "edit_time", "k_total": "k_total", "k_letter": "k_letter", "k_digit": "k_digit", "k_white": "k_white", "k_symbol": "k_symbol", "k_nav": "k_nav", "k_erase": "k_erase", "k_copy": "k_copy", "k_cut": "k_cut", "k_paste": "k_paste", "k_do": "k_do", "n_pause_geq_300": "n_pause_geq_300", "len_pause_geq_300": "len_pause_geq_300", "n_pause_geq_1000": "n_pause_geq_1000", "len_pause_geq_1000": "len_pause_geq_1000", "event_time": "event_time", "num_annotations": "num_annotations", "last_modification_time": "last_modification_time", "n_insert": "n_insert", "n_delete": "n_delete", "n_substitute": "n_substitute", "n_shift": "n_shift", "tot_shifted_words": "tot_shifted_words", "tot_edits": "tot_edits", "hter": "hter", "cer": "cer", "bleu": "bleu", "chrf": "chrf", "time_s": "time_s", "time_m": "time_m", "time_h": "time_h", "time_per_char": "time_per_char", "time_per_word": "time_per_word", "key_per_char": "key_per_char", "words_per_hour": "words_per_hour", "words_per_minute": "words_per_minute", "per_subject_visit_order": "per_subject_visit_order", "src_text": "src_text", "mt_text": "mt_text", "tgt_text": "tgt_text", "aligned_edit": "aligned_edit", "src_tokens": "src_tokens", "src_annotations.lemma": "src_annotations_lemma", "src_annotations.upos": "src_annotations_upos", "src_annotations.feats": "src_annotations_feats", "src_annotations.head": "src_annotations_head", "src_annotations.deprel": "src_annotations_deprel", "src_annotations.start_char": "src_annotations_start_char", "src_annotations.end_char": "src_annotations_end_char", "src_annotations.ner": "src_annotations_ner", "mt_tokens": "mt_tokens", "mt_annotations.lemma": "mt_annotations_lemma", "mt_annotations.upos": "mt_annotations_upos", "mt_annotations.feats": "mt_annotations_feats", "mt_annotations.head": "mt_annotations_head", "mt_annotations.deprel": "mt_annotations_deprel", "mt_annotations.start_char": "mt_annotations_start_char", "mt_annotations.end_char": "mt_annotations_end_char", "mt_annotations.ner": "mt_annotations_ner", "tgt_tokens": "tgt_tokens", "tgt_annotations.lemma": "tgt_annotations_lemma", "tgt_annotations.upos": "tgt_annotations_upos", "tgt_annotations.feats": "tgt_annotations_feats", "tgt_annotations.head": "tgt_annotations_head", "tgt_annotations.deprel": "tgt_annotations_deprel", "tgt_annotations.start_char": "tgt_annotations_start_char", "tgt_annotations.end_char": "tgt_annotations_end_char", "tgt_annotations.ner": "tgt_annotations_ner", "src_wmt22_qe": "src_wmt22_qe", "mt_wmt22_qe": "mt_wmt22_qe"}, "dataset_description": "DivEMT is the first publicly available post-editing study of Neural Machine Translation (NMT) over a typologically diverse set of target languages. Using a strictly controlled setup, 18 professional translators were instructed to translate or post-edit the same set of English documents into Arabic, Dutch, Italian, Turkish, Ukrainian, and Vietnamese. During the process, their edits, keystrokes, editing times, pauses, and perceived effort were logged, enabling an in-depth, cross-lingual evaluation of NMT quality and its post-editing process.\n", "dataset_name": "GroNLP/divemt"}}, "tags": ["task_categories:translation", "annotations_creators:machine-generated", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:en", "language:it", "language:vi", "language:nl", "language:uk", "language:tr", "language:ar"], "is_gated": false}, "mteb/amazon_reviews_multi": {"dataset_name": "mteb/amazon_reviews_multi", "description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.", "downloads": 2978, "configs": {"all_languages": {"config_name": "all_languages", "sample_row": "{\"id\": \"\\\"de_0203609\\\"\", \"text\": \"\\\"Leider nach 1 Jahr kaputt\\\\n\\\\nArmband ist leider n...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "de": {"config_name": "de", "sample_row": "{\"id\": \"\\\"de_0203609\\\"\", \"text\": \"\\\"Leider nach 1 Jahr kaputt\\\\n\\\\nArmband ist leider n...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "en": {"config_name": "en", "sample_row": "{\"id\": \"\\\"en_0964290\\\"\", \"text\": \"\\\"I'll spend twice the amount of time boxing up the...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "es": {"config_name": "es", "sample_row": "{\"id\": \"\\\"es_0491108\\\"\", \"text\": \"\\\"television Nevir\\\\n\\\\nNada bueno se me fue ka panta...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "fr": {"config_name": "fr", "sample_row": "{\"id\": \"\\\"fr_0424335\\\"\", \"text\": \"\\\"Brumisateur \\\\u00e0 pompe\\\\n\\\\nA d\\\\u00e9conseiller -...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "ja": {"config_name": "ja", "sample_row": "{\"id\": \"\\\"ja_0388536\\\"\", \"text\": \"\\\"\\\\u672c\\\\u9769\\\\u3067\\\\u3082\\\\u9632\\\\u6c34\\\\u3067\\\\u3082\\\\...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}, "zh": {"config_name": "zh", "sample_row": "{\"id\": \"\\\"zh_0626061\\\"\", \"text\": \"\\\"\\\\u6b64\\\\u4e66\\\\u4e0d\\\\u662f\\\\u672c\\\\u4eba\\\\u8d2d\\\\u4e70\\\\...\", \"label\": \"0\", \"label_text\": \"\\\"0\\\"\"}", "columns": ["id", "text", "label", "label_text"], "columns_mapping": {"id": "id", "text": "text", "label": "label", "label_text": "label_text"}, "dataset_description": "We provide an Amazon product reviews dataset for multilingual text classification. The dataset contains reviews in English, Japanese, German, French, Chinese and Spanish, collected between November 1, 2015 and November 1, 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID and the coarse-grained product category (e.g. \u2018books\u2019, \u2018appliances\u2019, etc.) The corpus is balanced across stars, so each star rating constitutes 20% of the reviews in each language.\nFor each language, there are 200,000, 5,000 and 5,000 reviews in the training, development and test sets respectively. The maximum number of reviews per reviewer is 20 and the maximum number of reviews per product is 20. All reviews are truncated after 2,000 characters, and all reviews are at least 20 characters long.\nNote that the language of a review does not necessarily match the language of its marketplace (e.g. reviews from amazon.de are primarily written in German, but could also be written in English, etc.). For this reason, we applied a language detection algorithm based on the work in Bojanowski et al. (2017) to determine the language of the review text and we removed reviews that were not written in the expected language.\n", "dataset_name": "mteb/amazon_reviews_multi"}}, "tags": ["language:de", "language:en", "language:es", "language:fr", "language:ja", "language:zh"], "is_gated": false}, "silver/lccc": {"dataset_name": "silver/lccc", "description": "LCCC: Large-scale Cleaned Chinese Conversation corpus (LCCC) is a large corpus of Chinese conversations.\nA rigorous data cleaning pipeline is designed to ensure the quality of the corpus.\nThis pipeline involves a set of rules and several classifier-based filters.\nNoises such as offensive or sensitive words, special symbols, emojis,\ngrammatically incorrect sentences, and incoherent conversations are filtered.", "downloads": 14, "configs": {"large": {"config_name": "large", "sample_row": "{\"dialog\": \"[\\\"\\\\u706b\\\\u9505 \\\\u6211 \\\\u5728 \\\\u91cd\\\\u5e86 \\\\u6210\\\\u...\"}", "columns": ["dialog"], "columns_mapping": {"dialog": "dialog"}, "dataset_description": "LCCC: Large-scale Cleaned Chinese Conversation corpus (LCCC) is a large corpus of Chinese conversations.\nA rigorous data cleaning pipeline is designed to ensure the quality of the corpus.\nThis pipeline involves a set of rules and several classifier-based filters.\nNoises such as offensive or sensitive words, special symbols, emojis,\ngrammatically incorrect sentences, and incoherent conversations are filtered.\n", "dataset_name": "silver/lccc"}, "base": {"config_name": "base", "sample_row": "{\"dialog\": \"[\\\"\\\\u4f60 \\\\u53bb \\\\u90a3\\\\u513f \\\\u7adf\\\\u7136 \\\\u4e0d\\\\u...\"}", "columns": ["dialog"], "columns_mapping": {"dialog": "dialog"}, "dataset_description": "LCCC: Large-scale Cleaned Chinese Conversation corpus (LCCC) is a large corpus of Chinese conversations.\nA rigorous data cleaning pipeline is designed to ensure the quality of the corpus.\nThis pipeline involves a set of rules and several classifier-based filters.\nNoises such as offensive or sensitive words, special symbols, emojis,\ngrammatically incorrect sentences, and incoherent conversations are filtered.\n", "dataset_name": "silver/lccc"}}, "tags": ["task_categories:conversational", "task_ids:dialogue-generation", "annotations_creators:other", "multilinguality:monolingual", "source_datasets:original", "language:zh", "dialogue-response-retrieval"], "is_gated": false}, "enwik8": {"dataset_name": "enwik8", "description": "The dataset is based on the Hutter Prize (http://prize.hutter1.net) and contains the first 10^8 bytes of English Wikipedia in 2006 in XML", "downloads": 6262, "configs": {"enwik8": {"config_name": "enwik8", "sample_row": "{\"text\": \"\\\" It is a one-person job , no problem.\\\"...\", \"paragraph_answer\": \"\\\"This cabinet is very easy to assemble. It says &#...\", \"paragraph_sentence\": \"\\\"This cabinet is very easy to assemble. It says &#...\", \"paragraph_id\": \"\\\"5dd4d824cc027a086d65fde6\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_squadshifts"}, "new_wiki": {"config_name": "new_wiki", "sample_row": "{\"answer\": \"\\\"Edison Electric Illuminating Company\\\"\", \"paragraph_question\": \"\\\"question: Consolidated Edison can trace it's root...\", \"question\": \"\\\"Consolidated Edison can trace it's roots back to ...\", \"sentence\": \"\\\"Con Edison's electric business traces its roots b...\", \"paragraph\": \"\\\"Gas and electric service is provided by Consolida...\", \"sentence_answer\": \"\\\"Con Edison's electric business traces its roots b...\", \"paragraph_answer\": \"\\\"Gas and electric service is provided by Consolida...\", \"paragraph_sentence\": \"\\\"Gas and electric service is provided by Consolida...\", \"paragraph_id\": \"\\\"5d66f6322b22cd4dfcfbe7d9\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_squadshifts"}, "nyt": {"config_name": "nyt", "sample_row": "{\"answer\": \"\\\"letters\\\"\", \"paragraph_question\": \"\\\"question: Ms. Clyne used facsimiles of what posse...\", \"question\": \"\\\"Ms. Clyne used facsimiles of what possession of E...\", \"sentence\": \"\\\"This time, Ms. Clyne used facsimiles of letters b...\", \"paragraph\": \"\\\"Ms. Clyne is at work on a chamber opera about the...\", \"sentence_answer\": \"\\\"This time, Ms. Clyne used facsimiles of lett...\", \"paragraph_answer\": \"\\\"Ms. Clyne is at work on a chamber opera about the...\", \"paragraph_sentence\": \"\\\"Ms. Clyne is at work on a chamber opera about the...\", \"paragraph_id\": \"\\\"5d704c4ac8e4820a9b66e9f7\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_squadshifts"}, "reddit": {"config_name": "reddit", "sample_row": "{\"answer\": \"\\\"pokegenning/romhacking\\\"\", \"paragraph_question\": \"\\\"question: What is the author's main reason for wa...\", \"question\": \"\\\"What is the author's main reason for wanting to h...\", \"sentence\": \"\\\"My main reason for wanting to hax is pokegenning/...\", \"paragraph\": \"\\\"Dis/advantages of 10.3 over 9.5? Just haxed my O3...\", \"sentence_answer\": \"\\\"My main reason for wanting to hax is pokegen...\", \"paragraph_answer\": \"\\\"Dis/advantages of 10.3 over 9.5? Just haxed my O3...\", \"paragraph_sentence\": \"\\\"Dis/advantages of 10.3 over 9.5? Just haxed my O3...\", \"paragraph_id\": \"\\\"5d9c25298ae5305bc982eff7\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD Shifts](https://modestyachts.github.io/squadshifts-website/index.html) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_squadshifts"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:subjqa", "language:en", "question-generation"], "is_gated": false}, "lmqg/qg_esquad": {"dataset_name": "lmqg/qg_esquad", "description": "[SQuAD-es](https://huggingface.co/datasets/squad_es) dataset for question generation (QG) task.", "downloads": 101, "configs": {"qg_esquad": {"config_name": "qg_esquad", "sample_row": "{\"answer\": \"\\\"8.491.079\\\"\", \"paragraph_question\": \"\\\"question: \\\\u00bfCu\\\\u00e1l es la poblaci\\\\u00f3n de...\", \"question\": \"\\\"\\\\u00bfCu\\\\u00e1l es la poblaci\\\\u00f3n de Nueva Yor...\", \"sentence\": \"\\\"Con una poblaci\\\\u00f3n censada estimada en 2014 d...\", \"paragraph\": \"\\\"Situada en uno de los mayores puertos naturales d...\", \"sentence_answer\": \"\\\"Con una poblaci\\\\u00f3n censada estimada en 2014 d...\", \"paragraph_answer\": \"\\\"Situada en uno de los mayores puertos naturales d...\", \"paragraph_sentence\": \"\\\"Situada en uno de los mayores puertos naturales d...\", \"paragraph_id\": \"\\\"56cf9d81234ae51400d9be1e\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SQuAD-es](https://huggingface.co/datasets/squad_es) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_esquad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:squad_es", "language:es", "question-generation"], "is_gated": false}, "lmqg/qg_koquad": {"dataset_name": "lmqg/qg_koquad", "description": "[KorQuAD](https://huggingface.co/datasets/squad_kor_v1) dataset for question generation (QG) task.", "downloads": 97, "configs": {"qg_koquad": {"config_name": "qg_koquad", "sample_row": "{\"answer\": \"\\\"\\\\uace0\\\\ub300 \\\\uc774\\\\uc2a4\\\\ub77c\\\\uc5d8\\\"\", \"paragraph_question\": \"\\\"question: \\\\uc5ed\\\\uc0ac\\\\uc5d0\\\\uc11c \\\\uc784\\\\uae08\\\\u...\", \"question\": \"\\\"\\\\uc5ed\\\\uc0ac\\\\uc5d0\\\\uc11c \\\\uc784\\\\uae08\\\\uc758 \\\\uc5b...\", \"sentence\": \"\\\"\\\\uace0\\\\ub300 \\\\uc774\\\\uc2a4\\\\ub77c\\\\uc5d8 \\\\uc758 \\\\uc5...\", \"paragraph\": \"\\\"\\\\ucc9c\\\\uc0c1\\\\uc758 \\\\ubaa8\\\\ud6c4\\\\uc758 \\\\uc131\\\\uacb...\", \"sentence_answer\": \"\\\" \\\\uace0\\\\ub300 \\\\uc774\\\\uc2a4\\\\ub77c\\\\uc5d8 \\\\...\", \"paragraph_answer\": \"\\\"\\\\ucc9c\\\\uc0c1\\\\uc758 \\\\ubaa8\\\\ud6c4\\\\uc758 \\\\uc131\\\\uacb...\", \"paragraph_sentence\": \"\\\"\\\\ucc9c\\\\uc0c1\\\\uc758 \\\\ubaa8\\\\ud6c4\\\\uc758 \\\\uc131\\\\uacb...\", \"paragraph_id\": \"\\\"6343803-2-2\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[KorQuAD](https://huggingface.co/datasets/squad_kor_v1) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_koquad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:squad_es", "language:ko", "question-generation"], "is_gated": false}, "lmqg/qg_ruquad": {"dataset_name": "lmqg/qg_ruquad", "description": "[SberSQuAD](https://huggingface.co/datasets/sberquad) dataset for question generation (QG) task.", "downloads": 92, "configs": {"qg_ruquad": {"config_name": "qg_ruquad", "sample_row": "{\"answer\": \"\\\"1,1 \\\\u043c/\\\\u0441\\\"\", \"paragraph_question\": \"\\\"question: \\\\u0447\\\\u0435\\\\u043c \\\\u0441\\\\u043e\\\\u043e\\\\u...\", \"question\": \"\\\"\\\\u0447\\\\u0435\\\\u043c \\\\u0441\\\\u043e\\\\u043e\\\\u0442\\\\u0432...\", \"sentence\": \"\\\"\\\\u0412 1975 \\\\u0433\\\\u043e\\\\u0434\\\\u0443 XV \\\\u0413\\\\u0...\", \"paragraph\": \"\\\"\\\\u041d\\\\u0430\\\\u0438\\\\u0432\\\\u044b\\\\u0441\\\\u0448\\\\u0430\\\\...\", \"sentence_answer\": \"\\\"\\\\u0412 1975 \\\\u0433\\\\u043e\\\\u0434\\\\u0443 XV \\\\u0413\\\\u0...\", \"paragraph_answer\": \"\\\"\\\\u041d\\\\u0430\\\\u0438\\\\u0432\\\\u044b\\\\u0441\\\\u0448\\\\u0430\\\\...\", \"paragraph_sentence\": \"\\\"\\\\u041d\\\\u0430\\\\u0438\\\\u0432\\\\u044b\\\\u0441\\\\u0448\\\\u0430\\\\...\", \"paragraph_id\": \"\\\"2978\\\"\"}", "columns": ["answer", "paragraph_question", "question", "sentence", "paragraph", "sentence_answer", "paragraph_answer", "paragraph_sentence", "paragraph_id"], "columns_mapping": {"answer": "answer", "paragraph_question": "paragraph_question", "question": "question", "sentence": "sentence", "paragraph": "paragraph", "sentence_answer": "sentence_answer", "paragraph_answer": "paragraph_answer", "paragraph_sentence": "paragraph_sentence", "paragraph_id": "paragraph_id"}, "dataset_description": "[SberSQuAD](https://huggingface.co/datasets/sberquad) dataset for question generation (QG) task.", "dataset_name": "lmqg/qg_ruquad"}}, "tags": ["task_categories:text-generation", "task_ids:language-modeling", "multilinguality:monolingual", "source_datasets:deepset/germanquad", "language:ru", "question-generation"], "is_gated": false}, "acronym_identification": {"dataset_name": "acronym_identification", "description": "Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.", "downloads": 1873, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"TR-0\\\"\", \"tokens\": \"[\\\"What\\\", \\\"is\\\", \\\"here\\\", \\\"called\\\", \\\"controlled\\\", \\\"na...\", \"labels\": \"[4, 4, 4, 4, 0, 2, 2, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4...\"}", "columns": ["id", "tokens", "labels"], "columns_mapping": {"id": "id", "tokens": "tokens", "labels": "labels"}, "dataset_description": "Acronym identification training and development sets for the acronym identification task at SDU@AAAI-21.\n", "dataset_name": "acronym_identification"}}, "tags": ["task_categories:token-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "acronym-identification"], "is_gated": false}, "ade_corpus_v2": {"dataset_name": "ade_corpus_v2", "description": " ADE-Corpus-V2 Dataset: Adverse Drug Reaction Data.\n This is a dataset for Classification if a sentence is ADE-related (True) or not (False) and Relation Extraction between Adverse Drug Event and Drug.\n DRUG-AE.rel provides relations between drugs and adverse effects.\n DRUG-DOSE.rel provides relations between drugs and dosages.\n ADE-NEG.txt provides all sentences in the ADE corpus that DO NOT contain any drug-related adverse effects.", "downloads": 2423, "configs": {"Ade_corpus_v2_classification": {"config_name": "Ade_corpus_v2_classification", "sample_row": "{\"text\": \"\\\"Intravenous azithromycin-induced ototoxicity.\\\"\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": " ADE-Corpus-V2 Dataset: Adverse Drug Reaction Data.\n This is a dataset for Classification if a sentence is ADE-related (True) or not (False) and Relation Extraction between Adverse Drug Event and Drug.\n DRUG-AE.rel provides relations between drugs and adverse effects.\n DRUG-DOSE.rel provides relations between drugs and dosages.\n ADE-NEG.txt provides all sentences in the ADE corpus that DO NOT contain any drug-related adverse effects.\n", "dataset_name": "ade_corpus_v2"}, "Ade_corpus_v2_drug_ade_relation": {"config_name": "Ade_corpus_v2_drug_ade_relation", "sample_row": "{\"text\": \"\\\"Intravenous azithromycin-induced ototoxicity.\\\"\", \"drug\": \"\\\"azithromycin\\\"\", \"effect\": \"\\\"ototoxicity\\\"\", \"indexes.drug.start_char\": \"[12]\", \"indexes.drug.end_char\": \"[24]\", \"indexes.effect.start_char\": \"[33]\", \"indexes.effect.end_char\": \"[44]\"}", "columns": ["text", "drug", "effect", "indexes_drug_start_char", "indexes_drug_end_char", "indexes_effect_start_char", "indexes_effect_end_char"], "columns_mapping": {"text": "text", "drug": "drug", "effect": "effect", "indexes.drug.start_char": "indexes_drug_start_char", "indexes.drug.end_char": "indexes_drug_end_char", "indexes.effect.start_char": "indexes_effect_start_char", "indexes.effect.end_char": "indexes_effect_end_char"}, "dataset_description": " ADE-Corpus-V2 Dataset: Adverse Drug Reaction Data.\n This is a dataset for Classification if a sentence is ADE-related (True) or not (False) and Relation Extraction between Adverse Drug Event and Drug.\n DRUG-AE.rel provides relations between drugs and adverse effects.\n DRUG-DOSE.rel provides relations between drugs and dosages.\n ADE-NEG.txt provides all sentences in the ADE corpus that DO NOT contain any drug-related adverse effects.\n", "dataset_name": "ade_corpus_v2"}, "Ade_corpus_v2_drug_dosage_relation": {"config_name": "Ade_corpus_v2_drug_dosage_relation", "sample_row": "{\"text\": \"\\\"An episode of subacute encephalopathy after the i...\", \"drug\": \"\\\"methotrexate\\\"\", \"dosage\": \"\\\"1500 mg/m2\\\"\", \"indexes.drug.start_char\": \"[79]\", \"indexes.drug.end_char\": \"[91]\", \"indexes.dosage.start_char\": \"[93]\", \"indexes.dosage.end_char\": \"[103]\"}", "columns": ["text", "drug", "dosage", "indexes_drug_start_char", "indexes_drug_end_char", "indexes_dosage_start_char", "indexes_dosage_end_char"], "columns_mapping": {"text": "text", "drug": "drug", "dosage": "dosage", "indexes.drug.start_char": "indexes_drug_start_char", "indexes.drug.end_char": "indexes_drug_end_char", "indexes.dosage.start_char": "indexes_dosage_start_char", "indexes.dosage.end_char": "indexes_dosage_end_char"}, "dataset_description": " ADE-Corpus-V2 Dataset: Adverse Drug Reaction Data.\n This is a dataset for Classification if a sentence is ADE-related (True) or not (False) and Relation Extraction between Adverse Drug Event and Drug.\n DRUG-AE.rel provides relations between drugs and adverse effects.\n DRUG-DOSE.rel provides relations between drugs and dosages.\n ADE-NEG.txt provides all sentences in the ADE corpus that DO NOT contain any drug-related adverse effects.\n", "dataset_name": "ade_corpus_v2"}}, "tags": ["task_categories:text-classification", "task_categories:token-classification", "task_ids:coreference-resolution", "task_ids:fact-checking", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "adversarial_qa": {"dataset_name": "adversarial_qa", "description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.", "downloads": 5708, "configs": {"adversarialQA": {"config_name": "adversarialQA", "sample_row": "{\"id\": \"\\\"7ba1e8f4261d3170fcf42e84a81dd749116fae95\\\"\", \"title\": \"\\\"Brain\\\"\", \"context\": \"\\\"Another approach to brain function is to examine ...\", \"question\": \"\\\"What sare the benifts of the blood brain barrir?\\\"...\", \"answers.text\": \"[\\\"isolated from the bloodstream\\\"]\", \"answers.answer_start\": \"[195]\", \"metadata.split\": \"\\\"train\\\"\", \"metadata.model_in_the_loop\": \"\\\"Combined\\\"\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start", "metadata_split", "metadata_model_in_the_loop"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "metadata.split": "metadata_split", "metadata.model_in_the_loop": "metadata_model_in_the_loop"}, "dataset_description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.\n", "dataset_name": "adversarial_qa"}, "dbidaf": {"config_name": "dbidaf", "sample_row": "{\"id\": \"\\\"821607441c173838196c4d1500c2ab21a044e6b0\\\"\", \"title\": \"\\\"Yale_University\\\"\", \"context\": \"\\\"Slack (2003) compares three groups that conducted...\", \"question\": \"\\\"what year were the research groups compared\\\"\", \"answers.text\": \"[\\\"2003\\\"]\", \"answers.answer_start\": \"[7]\", \"metadata.split\": \"\\\"train\\\"\", \"metadata.model_in_the_loop\": \"\\\"BiDAF\\\"\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start", "metadata_split", "metadata_model_in_the_loop"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "metadata.split": "metadata_split", "metadata.model_in_the_loop": "metadata_model_in_the_loop"}, "dataset_description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.\n", "dataset_name": "adversarial_qa"}, "dbert": {"config_name": "dbert", "sample_row": "{\"id\": \"\\\"dab017ed8a1c27c6afa2d8618abc3a477a4edffc\\\"\", \"title\": \"\\\"Empiricism\\\"\", \"context\": \"\\\"A generation later, the Irish Anglican bishop, Ge...\", \"question\": \"\\\"what concept is mentioned last?\\\"\", \"answers.text\": \"[\\\"subjective idealism\\\"]\", \"answers.answer_start\": \"[742]\", \"metadata.split\": \"\\\"train\\\"\", \"metadata.model_in_the_loop\": \"\\\"BERT-Large\\\"\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start", "metadata_split", "metadata_model_in_the_loop"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "metadata.split": "metadata_split", "metadata.model_in_the_loop": "metadata_model_in_the_loop"}, "dataset_description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.\n", "dataset_name": "adversarial_qa"}, "droberta": {"config_name": "droberta", "sample_row": "{\"id\": \"\\\"12cf36866b656dc4f254081fe6796ea1be2f6d43\\\"\", \"title\": \"\\\"Napoleon\\\"\", \"context\": \"\\\"When he became First Consul and later Emperor, Na...\", \"question\": \"\\\"What jewelry like accessories did he wear?\\\"\", \"answers.text\": \"[\\\"L\\\\u00e9gion d'honneur star, medal and ribbon, an...\", \"answers.answer_start\": \"[462]\", \"metadata.split\": \"\\\"train\\\"\", \"metadata.model_in_the_loop\": \"\\\"RoBERTa-Large\\\"\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start", "metadata_split", "metadata_model_in_the_loop"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start", "metadata.split": "metadata_split", "metadata.model_in_the_loop": "metadata_model_in_the_loop"}, "dataset_description": "AdversarialQA is a Reading Comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles using an adversarial model-in-the-loop.\nWe use three different models; BiDAF (Seo et al., 2016), BERT-Large (Devlin et al., 2018), and RoBERTa-Large (Liu et al., 2019) in the annotation loop and construct three datasets; D(BiDAF), D(BERT), and D(RoBERTa), each with 10,000 training examples, 1,000 validation, and 1,000 test examples.\nThe adversarial human annotation paradigm ensures that these datasets consist of questions that current state-of-the-art models (at least the ones used as adversaries in the annotation loop) find challenging.\n", "dataset_name": "adversarial_qa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "aeslc": {"dataset_name": "aeslc", "description": "A collection of email messages of employees in the Enron Corporation.\n\nThere are two features:\n - email_body: email body text.\n - subject_line: email subject text.", "downloads": 1738, "configs": {"default": {"config_name": "default", "sample_row": "{\"email_body\": \"\\\"Greg/Phillip, Attached is the Grande Communicati...\", \"subject_line\": \"\\\"Service Agreement\\\"\"}", "columns": ["email_body", "subject_line"], "columns_mapping": {"email_body": "email_body", "subject_line": "subject_line"}, "dataset_description": "\nA collection of email messages of employees in the Enron Corporation.\n\nThere are two features:\n - email_body: email body text.\n - subject_line: email subject text.\n", "dataset_name": "aeslc"}}, "tags": ["task_categories:summarization", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "aspect-based-summarization", "conversations-summarization", "multi-document-summarization", "email-headline-generation"], "is_gated": false}, "afrikaans_ner_corpus": {"dataset_name": "afrikaans_ner_corpus", "description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.", "downloads": 505, "configs": {"afrikaans_ner_corpus": {"config_name": "afrikaans_ner_corpus", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Vertaling\\\", \\\"van\\\", \\\"die\\\", \\\"inligting\\\", \\\"in\\\", \\\"di...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "Named entity annotated data from the NCHLT Text Resource Development: Phase II Project, annotated with PERSON, LOCATION, ORGANISATION and MISCELLANEOUS tags.\n", "dataset_name": "afrikaans_ner_corpus"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:af"], "is_gated": false}, "ag_news": {"dataset_name": "ag_news", "description": "AG is a collection of more than 1 million news articles. News articles have been\ngathered from more than 2000 news sources by ComeToMyHead in more than 1 year of\nactivity. ComeToMyHead is an academic news search engine which has been running\nsince July, 2004. The dataset is provided by the academic comunity for research\npurposes in data mining (clustering, classification, etc), information retrieval\n(ranking, search, etc), xml, data compression, data streaming, and any other\nnon-commercial activity. For more information, please refer to the link\nhttp://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .\n\nThe AG's news topic classification dataset is constructed by Xiang Zhang\n(xiang.zhang@nyu.edu) from the dataset above. It is used as a text\nclassification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann\nLeCun. Character-level Convolutional Networks for Text Classification. Advances\nin Neural Information Processing Systems 28 (NIPS 2015).", "downloads": 29760, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"Wall St. Bears Claw Back Into the Black (Reuters)...\", \"label\": \"2\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "AG is a collection of more than 1 million news articles. News articles have been\ngathered from more than 2000 news sources by ComeToMyHead in more than 1 year of\nactivity. ComeToMyHead is an academic news search engine which has been running\nsince July, 2004. The dataset is provided by the academic comunity for research\npurposes in data mining (clustering, classification, etc), information retrieval\n(ranking, search, etc), xml, data compression, data streaming, and any other\nnon-commercial activity. For more information, please refer to the link\nhttp://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .\n\nThe AG's news topic classification dataset is constructed by Xiang Zhang\n(xiang.zhang@nyu.edu) from the dataset above. It is used as a text\nclassification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann\nLeCun. Character-level Convolutional Networks for Text Classification. Advances\nin Neural Information Processing Systems 28 (NIPS 2015).\n", "dataset_name": "ag_news"}}, "tags": ["task_categories:text-classification", "task_ids:topic-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ai2_arc": {"dataset_name": "ai2_arc", "description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.", "downloads": 272398, "configs": {"ARC-Challenge": {"config_name": "ARC-Challenge", "sample_row": "{\"id\": \"\\\"Mercury_SC_415702\\\"\", \"question\": \"\\\"George wants to warm his hands quickly by rubbing...\", \"choices.text\": \"[\\\"dry palms\\\", \\\"wet palms\\\", \\\"palms covered with oil...\", \"choices.label\": \"[\\\"A\\\", \\\"B\\\", \\\"C\\\", \\\"D\\\"]\", \"answerKey\": \"\\\"A\\\"\"}", "columns": ["id", "question", "choices_text", "choices_label", "answerKey"], "columns_mapping": {"id": "id", "question": "question", "choices.text": "choices_text", "choices.label": "choices_label", "answerKey": "answerKey"}, "dataset_description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.\n", "dataset_name": "ai2_arc"}, "ARC-Easy": {"config_name": "ARC-Easy", "sample_row": "{\"id\": \"\\\"Mercury_7220990\\\"\", \"question\": \"\\\"Which factor will most likely cause a person to d...\", \"choices.text\": \"[\\\"a leg muscle relaxing after exercise\\\", \\\"a bacter...\", \"choices.label\": \"[\\\"A\\\", \\\"B\\\", \\\"C\\\", \\\"D\\\"]\", \"answerKey\": \"\\\"B\\\"\"}", "columns": ["id", "question", "choices_text", "choices_label", "answerKey"], "columns_mapping": {"id": "id", "question": "question", "choices.text": "choices_text", "choices.label": "choices_label", "answerKey": "answerKey"}, "dataset_description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.\n", "dataset_name": "ai2_arc"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "task_ids:multiple-choice-qa", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ajgt_twitter_ar": {"dataset_name": "ajgt_twitter_ar", "description": "Arabic Jordanian General Tweets (AJGT) Corpus consisted of 1,800 tweets annotated as positive and negative. Modern Standard Arabic (MSA) or Jordanian dialect.", "downloads": 541, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\" \\\\u0627\\\\u0631\\\\u0628\\\\u062f \\\\u0641\\\\u064a\\\\u0647\\\\u062...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Arabic Jordanian General Tweets (AJGT) Corpus consisted of 1,800 tweets annotated as positive and negative. Modern Standard Arabic (MSA) or Jordanian dialect.\n", "dataset_name": "ajgt_twitter_ar"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "allegro_reviews": {"dataset_name": "allegro_reviews", "description": "Allegro Reviews is a sentiment analysis dataset, consisting of 11,588 product reviews written in Polish and extracted\nfrom Allegro.pl - a popular e-commerce marketplace. Each review contains at least 50 words and has a rating on a scale\nfrom one (negative review) to five (positive review).\n\nWe recommend using the provided train/dev/test split. The ratings for the test set reviews are kept hidden.\nYou can evaluate your model using the online evaluation tool available on klejbenchmark.com.", "downloads": 461, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"Jako do ceny dobra. Przyssawka mog\\\\u0142aby by\\\\u0...\", \"rating\": \"3.0\"}", "columns": ["text", "rating"], "columns_mapping": {"text": "text", "rating": "rating"}, "dataset_description": "Allegro Reviews is a sentiment analysis dataset, consisting of 11,588 product reviews written in Polish and extracted\nfrom Allegro.pl - a popular e-commerce marketplace. Each review contains at least 50 words and has a rating on a scale\nfrom one (negative review) to five (positive review).\n\nWe recommend using the provided train/dev/test split. The ratings for the test set reviews are kept hidden.\nYou can evaluate your model using the online evaluation tool available on klejbenchmark.com.\n", "dataset_name": "allegro_reviews"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-scoring", "task_ids:text-scoring", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:pl"], "is_gated": false}, "amazon_polarity": {"dataset_name": "amazon_polarity", "description": "The Amazon reviews dataset consists of reviews from amazon.\nThe data span a period of 18 years, including ~35 million reviews up to March 2013.\nReviews include product and user information, ratings, and a plaintext review.", "downloads": 11831, "configs": {"amazon_polarity": {"config_name": "amazon_polarity", "sample_row": "{\"label\": \"1\", \"title\": \"\\\"Stuning even for the non-gamer\\\"\", \"content\": \"\\\"This sound track was beautiful! It paints the sen...\"}", "columns": ["label", "title", "content"], "columns_mapping": {"label": "label", "title": "title", "content": "content"}, "dataset_description": "The Amazon reviews dataset consists of reviews from amazon.\nThe data span a period of 18 years, including ~35 million reviews up to March 2013.\nReviews include product and user information, ratings, and a plaintext review.\n", "dataset_name": "amazon_polarity"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ambig_qa": {"dataset_name": "ambig_qa", "description": "AmbigNQ, a dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark. We find that over half of the questions in NQ-open are ambiguous. The types of ambiguity are diverse and sometimes subtle, many of which are only apparent after examining evidence provided by a very large text corpus. AMBIGNQ, a dataset with\n14,042 annotations on NQ-OPEN questions containing diverse types of ambiguity.\nWe provide two distributions of our new dataset AmbigNQ: a full version with all annotation metadata and a light version with only inputs and outputs.", "downloads": 832, "configs": {"light": {"config_name": "light", "sample_row": "{\"id\": \"\\\"-4469503464110108318\\\"\", \"question\": \"\\\"When did the simpsons first air on television?\\\"...\", \"annotations.type\": \"[\\\"multipleQAs\\\"]\", \"annotations.answer\": \"[[]]\", \"annotations.qaPairs\": \"[{\\\"question\\\": [\\\"When did the Simpsons first air on...\"}", "columns": ["id", "question", "annotations_type", "annotations_answer", "annotations_qaPairs"], "columns_mapping": {"id": "id", "question": "question", "annotations.type": "annotations_type", "annotations.answer": "annotations_answer", "annotations.qaPairs": "annotations_qaPairs"}, "dataset_description": "AmbigNQ, a dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark. We find that over half of the questions in NQ-open are ambiguous. The types of ambiguity are diverse and sometimes subtle, many of which are only apparent after examining evidence provided by a very large text corpus. AMBIGNQ, a dataset with\n14,042 annotations on NQ-OPEN questions containing diverse types of ambiguity.\nWe provide two distributions of our new dataset AmbigNQ: a full version with all annotation metadata and a light version with only inputs and outputs.\n", "dataset_name": "ambig_qa"}, "full": {"config_name": "full", "sample_row": "{\"id\": \"\\\"-4469503464110108318\\\"\", \"question\": \"\\\"When did the simpsons first air on television?\\\"...\", \"annotations.type\": \"[\\\"multipleQAs\\\"]\", \"annotations.answer\": \"[[]]\", \"annotations.qaPairs\": \"[{\\\"question\\\": [\\\"When did the Simpsons first air on...\", \"viewed_doc_titles\": \"[\\\"The Simpsons\\\"]\", \"used_queries.query\": \"[\\\"When did the simpsons first air on television?\\\"]...\", \"used_queries.results\": \"[{\\\"title\\\": [\\\"History of The Simpsons\\\", \\\"The Simpso...\", \"nq_answer\": \"[\\\"December 17 , 1989\\\"]\", \"nq_doc_title\": \"\\\"The Simpsons\\\"\"}", "columns": ["id", "question", "annotations_type", "annotations_answer", "annotations_qaPairs", "viewed_doc_titles", "used_queries_query", "used_queries_results", "nq_answer", "nq_doc_title"], "columns_mapping": {"id": "id", "question": "question", "annotations.type": "annotations_type", "annotations.answer": "annotations_answer", "annotations.qaPairs": "annotations_qaPairs", "viewed_doc_titles": "viewed_doc_titles", "used_queries.query": "used_queries_query", "used_queries.results": "used_queries_results", "nq_answer": "nq_answer", "nq_doc_title": "nq_doc_title"}, "dataset_description": "AmbigNQ, a dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark. We find that over half of the questions in NQ-open are ambiguous. The types of ambiguity are diverse and sometimes subtle, many of which are only apparent after examining evidence provided by a very large text corpus. AMBIGNQ, a dataset with\n14,042 annotations on NQ-OPEN questions containing diverse types of ambiguity.\nWe provide two distributions of our new dataset AmbigNQ: a full version with all annotation metadata and a light version with only inputs and outputs.\n", "dataset_name": "ambig_qa"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|natural_questions", "source_datasets:original", "language:en"], "is_gated": false}, "amttl": {"dataset_name": "amttl", "description": "Chinese word segmentation (CWS) trained from open source corpus faces dramatic performance drop\nwhen dealing with domain text, especially for a domain with lots of special terms and diverse\nwriting styles, such as the biomedical domain. However, building domain-specific CWS requires\nextremely high annotation cost. In this paper, we propose an approach by exploiting domain-invariant\nknowledge from high resource to low resource domains. Extensive experiments show that our mode\nachieves consistently higher accuracy than the single-task CWS and other transfer learning\nbaselines, especially when there is a large disparity between source and target domains.\n\nThis dataset is the accompanied medical Chinese word segmentation (CWS) dataset.\nThe tags are in BIES scheme.\n\nFor more details see https://www.aclweb.org/anthology/C18-1307/", "downloads": 358, "configs": {"amttl": {"config_name": "amttl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u5341\\\", \\\"\\\\u5e74\\\", \\\"\\\\u524d\\\", \\\"\\\\u5f97\\\", \\\"\\\\u7684\\\",...\", \"tags\": \"[0, 2, 3, 3, 3, 0, 2, 3, 0, 2, 0, 2, 0, 2, 0, 2, 3...\"}", "columns": ["id", "tokens", "tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "tags": "tags"}, "dataset_description": "Chinese word segmentation (CWS) trained from open source corpus faces dramatic performance drop\nwhen dealing with domain text, especially for a domain with lots of special terms and diverse\nwriting styles, such as the biomedical domain. However, building domain-specific CWS requires\nextremely high annotation cost. In this paper, we propose an approach by exploiting domain-invariant\nknowledge from high resource to low resource domains. Extensive experiments show that our mode\nachieves consistently higher accuracy than the single-task CWS and other transfer learning\nbaselines, especially when there is a large disparity between source and target domains.\n\nThis dataset is the accompanied medical Chinese word segmentation (CWS) dataset.\nThe tags are in BIES scheme.\n\nFor more details see https://www.aclweb.org/anthology/C18-1307/\n", "dataset_name": "amttl"}}, "tags": ["task_categories:token-classification", "task_ids:parsing", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:zh"], "is_gated": false}, "app_reviews": {"dataset_name": "app_reviews", "description": "It is a large dataset of Android applications belonging to 23 differentapps categories, which provides an overview of the types of feedback users report on the apps and documents the evolution of the related code metrics. The dataset contains about 395 applications of the F-Droid repository, including around 600 versions, 280,000 user reviews (extracted with specific text mining approaches)", "downloads": 3350, "configs": {"default": {"config_name": "default", "sample_row": "{\"package_name\": \"\\\"com.mantz_it.rfanalyzer\\\"\", \"review\": \"\\\"Great app! The new version now works on my Bravia...\", \"date\": \"\\\"October 12 2016\\\"\", \"star\": \"4\"}", "columns": ["package_name", "review", "date", "star"], "columns_mapping": {"package_name": "package_name", "review": "review", "date": "date", "star": "star"}, "dataset_description": "It is a large dataset of Android applications belonging to 23 differentapps categories, which provides an overview of the types of feedback users report on the apps and documents the evolution of the related code metrics. The dataset contains about 395 applications of the F-Droid repository, including around 600 versions, 280,000 user reviews (extracted with specific text mining approaches)\n", "dataset_name": "app_reviews"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:sentiment-scoring", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "bigIR/ar_cov19": {"dataset_name": "bigIR/ar_cov19", "description": "ArCOV-19 is an Arabic COVID-19 Twitter dataset that covers the period from 27th of January till 30th of April 2020. ArCOV-19 is designed to enable research under several domains including natural language processing, information retrieval, and social computing, among others", "downloads": 345, "configs": {"ar_cov19": {"config_name": "ar_cov19", "sample_row": "{\"tweetID\": \"\\\"1221583597573824515\\\"\"}", "columns": ["tweetID"], "columns_mapping": {"tweetID": "tweetID"}, "dataset_description": "ArCOV-19 is an Arabic COVID-19 Twitter dataset that covers the period from 27th of January till 30th of April 2020. ArCOV-19 is designed to enable research under several domains including natural language processing, information retrieval, and social computing, among others\n", "dataset_name": "bigIR/ar_cov19"}}, "tags": ["task_categories:other", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:ar", "data-mining"], "is_gated": false}, "ar_res_reviews": {"dataset_name": "ar_res_reviews", "description": "Dataset of 8364 restaurant reviews scrapped from qaym.com in Arabic for sentiment analysis", "downloads": 377, "configs": {"default": {"config_name": "default", "sample_row": "{\"polarity\": \"0\", \"text\": \"\\\"\\\\u0627\\\\u0648\\\\u0644\\\\u0627: \\\\u0627\\\\u0644\\\\u0645\\\\u064...\", \"restaurant_id\": \"\\\"296\\\"\", \"user_id\": \"\\\"423\\\"\"}", "columns": ["polarity", "text", "restaurant_id", "user_id"], "columns_mapping": {"polarity": "polarity", "text": "text", "restaurant_id": "restaurant_id", "user_id": "user_id"}, "dataset_description": "Dataset of 8364 restaurant reviews scrapped from qaym.com in Arabic for sentiment analysis\n", "dataset_name": "ar_res_reviews"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "ar_sarcasm": {"dataset_name": "ar_sarcasm", "description": "ArSarcasm is a new Arabic sarcasm detection dataset.\nThe dataset was created using previously available Arabic sentiment analysis datasets (SemEval 2017 and ASTD)\n and adds sarcasm and dialect labels to them. The dataset contains 10,547 tweets, 1,682 (16%) of which are sarcastic.", "downloads": 417, "configs": {"default": {"config_name": "default", "sample_row": "{\"dialect\": \"1\", \"sarcasm\": \"0\", \"sentiment\": \"0\", \"original_sentiment\": \"0\", \"tweet\": \"\\\"\\\\u0646\\\\u0635\\\\u064a\\\\u062d\\\\u0647 \\\\u0645\\\\u0627 \\\\u063...\", \"source\": \"\\\"semeval\\\"\"}", "columns": ["dialect", "sarcasm", "sentiment", "original_sentiment", "tweet", "source"], "columns_mapping": {"dialect": "dialect", "sarcasm": "sarcasm", "sentiment": "sentiment", "original_sentiment": "original_sentiment", "tweet": "tweet", "source": "source"}, "dataset_description": "ArSarcasm is a new Arabic sarcasm detection dataset.\nThe dataset was created using previously available Arabic sentiment analysis datasets (SemEval 2017 and ASTD)\n and adds sarcasm and dialect labels to them. The dataset contains 10,547 tweets, 1,682 (16%) of which are sarcastic.\n", "dataset_name": "ar_sarcasm"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:extended|other-semeval_2017", "source_datasets:extended|other-astd", "language:ar", "sarcasm-detection"], "is_gated": false}, "arabic_pos_dialect": {"dataset_name": "arabic_pos_dialect", "description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). Each dataset consists of a set of 350 manually segmented and POS tagged tweets.", "downloads": 878, "configs": {"egy": {"config_name": "egy", "sample_row": "{\"fold\": \"4\", \"subfold\": \"\\\"A\\\"\", \"words\": \"[\\\"\\\\u0644\\\\u064a\\\\u0647\\\", \\\"\\\\u0644\\\\u0645\\\\u0627\\\", \\\"\\\\u06...\", \"segments\": \"[\\\"\\\\u0644\\\\u064a\\\\u0647\\\", \\\"\\\\u0644\\\\u0645\\\\u0627\\\", \\\"\\\\u06...\", \"pos_tags\": \"[\\\"PART\\\", \\\"PART\\\", \\\"V\\\", \\\"NOUN\\\", \\\"PREP\\\", \\\"NOUN+PRON\\\",...\"}", "columns": ["fold", "subfold", "words", "segments", "pos_tags"], "columns_mapping": {"fold": "fold", "subfold": "subfold", "words": "words", "segments": "segments", "pos_tags": "pos_tags"}, "dataset_description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). Each dataset consists of a set of 350 manually segmented and POS tagged tweets.\n", "dataset_name": "arabic_pos_dialect"}, "lev": {"config_name": "lev", "sample_row": "{\"fold\": \"2\", \"subfold\": \"\\\"B\\\"\", \"words\": \"[\\\"\\\\u0642\\\\u0627\\\\u0639\\\\u062f\\\", \\\"\\\\u0639\\\\u0645\\\", \\\"\\\\u06...\", \"segments\": \"[\\\"\\\\u0642\\\\u0627\\\\u0639\\\\u062f\\\", \\\"\\\\u0639\\\\u0645\\\", \\\"\\\\u06...\", \"pos_tags\": \"[\\\"ADJ\\\", \\\"PART\\\", \\\"V\\\", \\\"NOUN+PRON\\\", \\\"PREP+DET+ADJ\\\", ...\"}", "columns": ["fold", "subfold", "words", "segments", "pos_tags"], "columns_mapping": {"fold": "fold", "subfold": "subfold", "words": "words", "segments": "segments", "pos_tags": "pos_tags"}, "dataset_description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). Each dataset consists of a set of 350 manually segmented and POS tagged tweets.\n", "dataset_name": "arabic_pos_dialect"}, "glf": {"config_name": "glf", "sample_row": "{\"fold\": \"2\", \"subfold\": \"\\\"B\\\"\", \"words\": \"[\\\"@tagimlm77\\\", \\\"@444Tf\\\", \\\"\\\\u0647\\\\u0648\\\", \\\"\\\\u062b\\\\u...\", \"segments\": \"[\\\"@tagimlm77\\\", \\\"@444Tf\\\", \\\"\\\\u0647\\\\u0648\\\", \\\"\\\\u062b\\\\u...\", \"pos_tags\": \"[\\\"MENTION\\\", \\\"MENTION\\\", \\\"PRON\\\", \\\"NOUN\\\", \\\"ADJ\\\", \\\"NOU...\"}", "columns": ["fold", "subfold", "words", "segments", "pos_tags"], "columns_mapping": {"fold": "fold", "subfold": "subfold", "words": "words", "segments": "segments", "pos_tags": "pos_tags"}, "dataset_description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). Each dataset consists of a set of 350 manually segmented and POS tagged tweets.\n", "dataset_name": "arabic_pos_dialect"}, "mgr": {"config_name": "mgr", "sample_row": "{\"fold\": \"2\", \"subfold\": \"\\\"B\\\"\", \"words\": \"[\\\"0.7\\\", \\\"\\\\u062f\\\\u064a\\\\u0627\\\\u0644\\\", \\\"\\\\u0627\\\\u0644\\\\...\", \"segments\": \"[\\\"0.7\\\", \\\"\\\\u062f\\\\u064a\\\\u0627\\\\u0644\\\", \\\"\\\\u0627\\\\u0644+...\", \"pos_tags\": \"[\\\"NUM\\\", \\\"PREP\\\", \\\"DET+NOUN+NSUFF\\\", \\\"PART\\\", \\\"V+PRON\\\"...\"}", "columns": ["fold", "subfold", "words", "segments", "pos_tags"], "columns_mapping": {"fold": "fold", "subfold": "subfold", "words": "words", "segments": "segments", "pos_tags": "pos_tags"}, "dataset_description": "The Dialectal Arabic Datasets contain four dialects of Arabic, Etyptian (EGY), Levantine (LEV), Gulf (GLF), and Maghrebi (MGR). Each dataset consists of a set of 350 manually segmented and POS tagged tweets.\n", "dataset_name": "arabic_pos_dialect"}}, "tags": ["task_categories:token-classification", "task_ids:part-of-speech", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:extended", "language:ar"], "is_gated": false}, "arcd": {"dataset_name": "arcd", "description": " Arabic Reading Comprehension Dataset (ARCD) composed of 1,395 questions posed by crowdworkers on Wikipedia articles.", "downloads": 497, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"\\\"969331847966\\\"\", \"title\": \"\\\"\\\\u062c\\\\u0645\\\\u0627\\\\u0644 \\\\u062e\\\\u0627\\\\u0634\\\\u0642...\", \"context\": \"\\\"\\\\u062c\\\\u0645\\\\u0627\\\\u0644 \\\\u0623\\\\u062d\\\\u0645\\\\u062f...\", \"question\": \"\\\"- \\\\u0645\\\\u0646 \\\\u0647\\\\u0648 \\\\u062c\\\\u0645\\\\u0627\\\\u0...\", \"answers.text\": \"[\\\"\\\\u0635\\\\u062d\\\\u0641\\\\u064a \\\\u0648\\\\u0625\\\\u0639\\\\u064...\", \"answers.answer_start\": \"[73]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": " Arabic Reading Comprehension Dataset (ARCD) composed of 1,395 questions posed by crowdworkers on Wikipedia articles.\n", "dataset_name": "arcd"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "art": {"dataset_name": "art", "description": "the Abductive Natural Language Inference Dataset from AI2", "downloads": 541, "configs": {"anli": {"config_name": "anli", "sample_row": "{\"observation_1\": \"\\\"Chad went to get the wheel alignment measured on ...\", \"observation_2\": \"\\\"The mechanic provided a working alignment with ne...\", \"hypothesis_1\": \"\\\"Chad was waiting for his car to be washed.\\\"\", \"hypothesis_2\": \"\\\"Chad was waiting for his car to be finished.\\\"\", \"label\": \"2\"}", "columns": ["observation_1", "observation_2", "hypothesis_1", "hypothesis_2", "label"], "columns_mapping": {"observation_1": "observation_1", "observation_2": "observation_2", "hypothesis_1": "hypothesis_1", "hypothesis_2": "hypothesis_2", "label": "label"}, "dataset_description": "the Abductive Natural Language Inference Dataset from AI2\n", "dataset_name": "art"}}, "tags": ["task_categories:multiple-choice", "task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "abductive-natural-language-inference"], "is_gated": false}, "ascent_kb": {"dataset_name": "ascent_kb", "description": "This dataset contains 8.9M commonsense assertions extracted by the Ascent pipeline (https://ascent.mpi-inf.mpg.de/).", "downloads": 513, "configs": {"canonical": {"config_name": "canonical", "sample_row": "{\"arg1\": \"\\\"aa\\\"\", \"rel\": \"\\\"/r/HasProperty\\\"\", \"arg2\": \"\\\"immunohistochemistry staining\\\"\", \"support\": \"1\", \"facets\": \"[]\", \"source_sentences\": \"[{\\\"text\\\": \\\"AA can be identified by immunohistochem...\"}", "columns": ["arg1", "rel", "arg2", "support", "facets", "source_sentences"], "columns_mapping": {"arg1": "arg1", "rel": "rel", "arg2": "arg2", "support": "support", "facets": "facets", "source_sentences": "source_sentences"}, "dataset_description": "This dataset contains 8.9M commonsense assertions extracted by the Ascent pipeline (https://ascent.mpi-inf.mpg.de/).\n", "dataset_name": "ascent_kb"}, "open": {"config_name": "open", "sample_row": "{\"subject\": \"\\\"aa\\\"\", \"predicate\": \"\\\"be identified by\\\"\", \"object\": \"\\\"immunohistochemistry staining\\\"\", \"support\": \"1\", \"facets\": \"[]\", \"source_sentences\": \"[{\\\"text\\\": \\\"AA can be identified by immunohistochem...\"}", "columns": ["subject", "predicate", "object", "support", "facets", "source_sentences"], "columns_mapping": {"subject": "subject", "predicate": "predicate", "object": "object", "support": "support", "facets": "facets", "source_sentences": "source_sentences"}, "dataset_description": "This dataset contains 8.9M commonsense assertions extracted by the Ascent pipeline (https://ascent.mpi-inf.mpg.de/).\n", "dataset_name": "ascent_kb"}}, "tags": ["task_categories:other", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "knowledge-base"], "is_gated": false}, "aslg_pc12": {"dataset_name": "aslg_pc12", "description": "A large synthetic collection of parallel English and ASL-Gloss texts.\nThere are two string features: text, and gloss.", "downloads": 457, "configs": {"default": {"config_name": "default", "sample_row": "{\"gloss\": \"\\\"\\\\ufeffMEMBERSHIP PARLIAMENT SEE MINUTE\\\\n\\\"\", \"text\": \"\\\"\\\\ufeffmembership of parliament see minutes\\\\n\\\"\"}", "columns": ["gloss", "text"], "columns_mapping": {"gloss": "gloss", "text": "text"}, "dataset_description": "A large synthetic collection of parallel English and ASL-Gloss texts.\nThere are two string features: text, and gloss.\n", "dataset_name": "aslg_pc12"}}, "tags": ["task_categories:translation", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:ase", "language:en"], "is_gated": false}, "asnq": {"dataset_name": "asnq", "description": "ASNQ is a dataset for answer sentence selection derived from\nGoogle's Natural Questions (NQ) dataset (Kwiatkowski et al. 2019).\n\nEach example contains a question, candidate sentence, label indicating whether or not\nthe sentence answers the question, and two additional features --\nsentence_in_long_answer and short_answer_in_sentence indicating whether ot not the\ncandidate sentence is contained in the long_answer and if the short_answer is in the candidate sentence.\n\nFor more details please see\nhttps://arxiv.org/pdf/1911.04118.pdf\n\nand\n\nhttps://research.google/pubs/pub47761/", "downloads": 409, "configs": {"default": {"config_name": "default", "sample_row": "{\"question\": \"\\\"what is the use of fn key in mac\\\"\", \"sentence\": \"\\\"It is typically found on laptops due to their key...\", \"label\": \"0\", \"sentence_in_long_answer\": \"false\", \"short_answer_in_sentence\": \"false\"}", "columns": ["question", "sentence", "label", "sentence_in_long_answer", "short_answer_in_sentence"], "columns_mapping": {"question": "question", "sentence": "sentence", "label": "label", "sentence_in_long_answer": "sentence_in_long_answer", "short_answer_in_sentence": "short_answer_in_sentence"}, "dataset_description": "ASNQ is a dataset for answer sentence selection derived from\nGoogle's Natural Questions (NQ) dataset (Kwiatkowski et al. 2019).\n\nEach example contains a question, candidate sentence, label indicating whether or not\nthe sentence answers the question, and two additional features --\nsentence_in_long_answer and short_answer_in_sentence indicating whether ot not the\ncandidate sentence is contained in the long_answer and if the short_answer is in the candidate sentence.\n\nFor more details please see\nhttps://arxiv.org/pdf/1911.04118.pdf\n\nand\n\nhttps://research.google/pubs/pub47761/\n", "dataset_name": "asnq"}}, "tags": ["task_categories:multiple-choice", "task_ids:multiple-choice-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|natural_questions", "language:en"], "is_gated": false}, "assin": {"dataset_name": "assin", "description": "The ASSIN (Avalia\u00e7\u00e3o de Similaridade Sem\u00e2ntica e INfer\u00eancia textual) corpus is a corpus annotated with pairs of sentences written in\nPortuguese that is suitable for the exploration of textual entailment and paraphrasing classifiers. The corpus contains pairs of sentences\nextracted from news articles written in European Portuguese (EP) and Brazilian Portuguese (BP), obtained from Google News Portugal\nand Brazil, respectively. To create the corpus, the authors started by collecting a set of news articles describing the\nsame event (one news article from Google News Portugal and another from Google News Brazil) from Google News.\nThen, they employed Latent Dirichlet Allocation (LDA) models to retrieve pairs of similar sentences between sets of news\narticles that were grouped together around the same topic. For that, two LDA models were trained (for EP and for BP)\non external and large-scale collections of unannotated news articles from Portuguese and Brazilian news providers, respectively.\nThen, the authors defined a lower and upper threshold for the sentence similarity score of the retrieved pairs of sentences,\ntaking into account that high similarity scores correspond to sentences that contain almost the same content (paraphrase candidates),\nand low similarity scores correspond to sentences that are very different in content from each other (no-relation candidates).\nFrom the collection of pairs of sentences obtained at this stage, the authors performed some manual grammatical corrections\nand discarded some of the pairs wrongly retrieved. Furthermore, from a preliminary analysis made to the retrieved sentence pairs\nthe authors noticed that the number of contradictions retrieved during the previous stage was very low. Additionally, they also\nnoticed that event though paraphrases are not very frequent, they occur with some frequency in news articles. Consequently,\nin contrast with the majority of the currently available corpora for other languages, which consider as labels \u201cneutral\u201d, \u201centailment\u201d\nand \u201ccontradiction\u201d for the task of RTE, the authors of the ASSIN corpus decided to use as labels \u201cnone\u201d, \u201centailment\u201d and \u201cparaphrase\u201d.\nFinally, the manual annotation of pairs of sentences was performed by human annotators. At least four annotators were randomly\nselected to annotate each pair of sentences, which is done in two steps: (i) assigning a semantic similarity label (a score between 1 and 5,\nfrom unrelated to very similar); and (ii) providing an entailment label (one sentence entails the other, sentences are paraphrases,\nor no relation). Sentence pairs where at least three annotators do not agree on the entailment label were considered controversial\nand thus discarded from the gold standard annotations. The full dataset has 10,000 sentence pairs, half of which in Brazilian Portuguese\nand half in European Portuguese. Either language variant has 2,500 pairs for training, 500 for validation and 2,000 for testing.", "downloads": 869, "configs": {"full": {"config_name": "full", "sample_row": "{\"sentence_pair_id\": \"1\", \"premise\": \"\\\"A gente faz o aporte financeiro, \\\\u00e9 como se a...\", \"hypothesis\": \"\\\"Fernando Moraes afirma que n\\\\u00e3o tem v\\\\u00ednc...\", \"relatedness_score\": \"2.0\", \"entailment_judgment\": \"0\"}", "columns": ["sentence_pair_id", "premise", "hypothesis", "relatedness_score", "entailment_judgment"], "columns_mapping": {"sentence_pair_id": "sentence_pair_id", "premise": "premise", "hypothesis": "hypothesis", "relatedness_score": "relatedness_score", "entailment_judgment": "entailment_judgment"}, "dataset_description": "\nThe ASSIN (Avalia\u00e7\u00e3o de Similaridade Sem\u00e2ntica e INfer\u00eancia textual) corpus is a corpus annotated with pairs of sentences written in\nPortuguese that is suitable for the exploration of textual entailment and paraphrasing classifiers. The corpus contains pairs of sentences\nextracted from news articles written in European Portuguese (EP) and Brazilian Portuguese (BP), obtained from Google News Portugal\nand Brazil, respectively. To create the corpus, the authors started by collecting a set of news articles describing the\nsame event (one news article from Google News Portugal and another from Google News Brazil) from Google News.\nThen, they employed Latent Dirichlet Allocation (LDA) models to retrieve pairs of similar sentences between sets of news\narticles that were grouped together around the same topic. For that, two LDA models were trained (for EP and for BP)\non external and large-scale collections of unannotated news articles from Portuguese and Brazilian news providers, respectively.\nThen, the authors defined a lower and upper threshold for the sentence similarity score of the retrieved pairs of sentences,\ntaking into account that high similarity scores correspond to sentences that contain almost the same content (paraphrase candidates),\nand low similarity scores correspond to sentences that are very different in content from each other (no-relation candidates).\nFrom the collection of pairs of sentences obtained at this stage, the authors performed some manual grammatical corrections\nand discarded some of the pairs wrongly retrieved. Furthermore, from a preliminary analysis made to the retrieved sentence pairs\nthe authors noticed that the number of contradictions retrieved during the previous stage was very low. Additionally, they also\nnoticed that event though paraphrases are not very frequent, they occur with some frequency in news articles. Consequently,\nin contrast with the majority of the currently available corpora for other languages, which consider as labels \u201cneutral\u201d, \u201centailment\u201d\nand \u201ccontradiction\u201d for the task of RTE, the authors of the ASSIN corpus decided to use as labels \u201cnone\u201d, \u201centailment\u201d and \u201cparaphrase\u201d.\nFinally, the manual annotation of pairs of sentences was performed by human annotators. At least four annotators were randomly\nselected to annotate each pair of sentences, which is done in two steps: (i) assigning a semantic similarity label (a score between 1 and 5,\nfrom unrelated to very similar); and (ii) providing an entailment label (one sentence entails the other, sentences are paraphrases,\nor no relation). Sentence pairs where at least three annotators do not agree on the entailment label were considered controversial\nand thus discarded from the gold standard annotations. The full dataset has 10,000 sentence pairs, half of which in Brazilian Portuguese\nand half in European Portuguese. Either language variant has 2,500 pairs for training, 500 for validation and 2,000 for testing.\n", "dataset_name": "assin"}, "ptpt": {"config_name": "ptpt", "sample_row": "{\"sentence_pair_id\": \"1\", \"premise\": \"\\\"Relembre-se que o atleta estava afastado dos relv...\", \"hypothesis\": \"\\\"Andr\\\\u00e9 Gomes entra em campo quatro meses depo...\", \"relatedness_score\": \"3.5\", \"entailment_judgment\": \"0\"}", "columns": ["sentence_pair_id", "premise", "hypothesis", "relatedness_score", "entailment_judgment"], "columns_mapping": {"sentence_pair_id": "sentence_pair_id", "premise": "premise", "hypothesis": "hypothesis", "relatedness_score": "relatedness_score", "entailment_judgment": "entailment_judgment"}, "dataset_description": "\nThe ASSIN (Avalia\u00e7\u00e3o de Similaridade Sem\u00e2ntica e INfer\u00eancia textual) corpus is a corpus annotated with pairs of sentences written in\nPortuguese that is suitable for the exploration of textual entailment and paraphrasing classifiers. The corpus contains pairs of sentences\nextracted from news articles written in European Portuguese (EP) and Brazilian Portuguese (BP), obtained from Google News Portugal\nand Brazil, respectively. To create the corpus, the authors started by collecting a set of news articles describing the\nsame event (one news article from Google News Portugal and another from Google News Brazil) from Google News.\nThen, they employed Latent Dirichlet Allocation (LDA) models to retrieve pairs of similar sentences between sets of news\narticles that were grouped together around the same topic. For that, two LDA models were trained (for EP and for BP)\non external and large-scale collections of unannotated news articles from Portuguese and Brazilian news providers, respectively.\nThen, the authors defined a lower and upper threshold for the sentence similarity score of the retrieved pairs of sentences,\ntaking into account that high similarity scores correspond to sentences that contain almost the same content (paraphrase candidates),\nand low similarity scores correspond to sentences that are very different in content from each other (no-relation candidates).\nFrom the collection of pairs of sentences obtained at this stage, the authors performed some manual grammatical corrections\nand discarded some of the pairs wrongly retrieved. Furthermore, from a preliminary analysis made to the retrieved sentence pairs\nthe authors noticed that the number of contradictions retrieved during the previous stage was very low. Additionally, they also\nnoticed that event though paraphrases are not very frequent, they occur with some frequency in news articles. Consequently,\nin contrast with the majority of the currently available corpora for other languages, which consider as labels \u201cneutral\u201d, \u201centailment\u201d\nand \u201ccontradiction\u201d for the task of RTE, the authors of the ASSIN corpus decided to use as labels \u201cnone\u201d, \u201centailment\u201d and \u201cparaphrase\u201d.\nFinally, the manual annotation of pairs of sentences was performed by human annotators. At least four annotators were randomly\nselected to annotate each pair of sentences, which is done in two steps: (i) assigning a semantic similarity label (a score between 1 and 5,\nfrom unrelated to very similar); and (ii) providing an entailment label (one sentence entails the other, sentences are paraphrases,\nor no relation). Sentence pairs where at least three annotators do not agree on the entailment label were considered controversial\nand thus discarded from the gold standard annotations. The full dataset has 10,000 sentence pairs, half of which in Brazilian Portuguese\nand half in European Portuguese. Either language variant has 2,500 pairs for training, 500 for validation and 2,000 for testing.\n", "dataset_name": "assin"}, "ptbr": {"config_name": "ptbr", "sample_row": "{\"sentence_pair_id\": \"1\", \"premise\": \"\\\"A gente faz o aporte financeiro, \\\\u00e9 como se a...\", \"hypothesis\": \"\\\"Fernando Moraes afirma que n\\\\u00e3o tem v\\\\u00ednc...\", \"relatedness_score\": \"2.0\", \"entailment_judgment\": \"0\"}", "columns": ["sentence_pair_id", "premise", "hypothesis", "relatedness_score", "entailment_judgment"], "columns_mapping": {"sentence_pair_id": "sentence_pair_id", "premise": "premise", "hypothesis": "hypothesis", "relatedness_score": "relatedness_score", "entailment_judgment": "entailment_judgment"}, "dataset_description": "\nThe ASSIN (Avalia\u00e7\u00e3o de Similaridade Sem\u00e2ntica e INfer\u00eancia textual) corpus is a corpus annotated with pairs of sentences written in\nPortuguese that is suitable for the exploration of textual entailment and paraphrasing classifiers. The corpus contains pairs of sentences\nextracted from news articles written in European Portuguese (EP) and Brazilian Portuguese (BP), obtained from Google News Portugal\nand Brazil, respectively. To create the corpus, the authors started by collecting a set of news articles describing the\nsame event (one news article from Google News Portugal and another from Google News Brazil) from Google News.\nThen, they employed Latent Dirichlet Allocation (LDA) models to retrieve pairs of similar sentences between sets of news\narticles that were grouped together around the same topic. For that, two LDA models were trained (for EP and for BP)\non external and large-scale collections of unannotated news articles from Portuguese and Brazilian news providers, respectively.\nThen, the authors defined a lower and upper threshold for the sentence similarity score of the retrieved pairs of sentences,\ntaking into account that high similarity scores correspond to sentences that contain almost the same content (paraphrase candidates),\nand low similarity scores correspond to sentences that are very different in content from each other (no-relation candidates).\nFrom the collection of pairs of sentences obtained at this stage, the authors performed some manual grammatical corrections\nand discarded some of the pairs wrongly retrieved. Furthermore, from a preliminary analysis made to the retrieved sentence pairs\nthe authors noticed that the number of contradictions retrieved during the previous stage was very low. Additionally, they also\nnoticed that event though paraphrases are not very frequent, they occur with some frequency in news articles. Consequently,\nin contrast with the majority of the currently available corpora for other languages, which consider as labels \u201cneutral\u201d, \u201centailment\u201d\nand \u201ccontradiction\u201d for the task of RTE, the authors of the ASSIN corpus decided to use as labels \u201cnone\u201d, \u201centailment\u201d and \u201cparaphrase\u201d.\nFinally, the manual annotation of pairs of sentences was performed by human annotators. At least four annotators were randomly\nselected to annotate each pair of sentences, which is done in two steps: (i) assigning a semantic similarity label (a score between 1 and 5,\nfrom unrelated to very similar); and (ii) providing an entailment label (one sentence entails the other, sentences are paraphrases,\nor no relation). Sentence pairs where at least three annotators do not agree on the entailment label were considered controversial\nand thus discarded from the gold standard annotations. The full dataset has 10,000 sentence pairs, half of which in Brazilian Portuguese\nand half in European Portuguese. Either language variant has 2,500 pairs for training, 500 for validation and 2,000 for testing.\n", "dataset_name": "assin"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:natural-language-inference", "task_ids:semantic-similarity-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt"], "is_gated": false}, "assin2": {"dataset_name": "assin2", "description": "The ASSIN 2 corpus is composed of rather simple sentences. Following the procedures of SemEval 2014 Task 1.\nThe training and validation data are composed, respectively, of 6,500 and 500 sentence pairs in Brazilian Portuguese,\nannotated for entailment and semantic similarity. Semantic similarity values range from 1 to 5, and text entailment\nclasses are either entailment or none. The test data are composed of approximately 3,000 sentence pairs with the same\nannotation. All data were manually annotated.", "downloads": 1522, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence_pair_id\": \"1\", \"premise\": \"\\\"Uma crian\\\\u00e7a risonha est\\\\u00e1 segurando uma ...\", \"hypothesis\": \"\\\"Uma crian\\\\u00e7a est\\\\u00e1 segurando uma pistola ...\", \"relatedness_score\": \"4.5\", \"entailment_judgment\": \"1\"}", "columns": ["sentence_pair_id", "premise", "hypothesis", "relatedness_score", "entailment_judgment"], "columns_mapping": {"sentence_pair_id": "sentence_pair_id", "premise": "premise", "hypothesis": "hypothesis", "relatedness_score": "relatedness_score", "entailment_judgment": "entailment_judgment"}, "dataset_description": "\nThe ASSIN 2 corpus is composed of rather simple sentences. Following the procedures of SemEval 2014 Task 1.\nThe training and validation data are composed, respectively, of 6,500 and 500 sentence pairs in Brazilian Portuguese,\nannotated for entailment and semantic similarity. Semantic similarity values range from 1 to 5, and text entailment\nclasses are either entailment or none. The test data are composed of approximately 3,000 sentence pairs with the same\nannotation. All data were manually annotated.\n", "dataset_name": "assin2"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:natural-language-inference", "task_ids:semantic-similarity-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt"], "is_gated": false}, "facebook/babi_qa": {"dataset_name": "facebook/babi_qa", "description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.", "downloads": 1945, "configs": {"en-qa1": {"config_name": "en-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Mary moved to the bathroom.\\\", \\\"John went to the ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"bathroom\\\", \\\"\\\", \\\"\\\", \\\"hallway\\\", \\\"\\\", \\\"\\\", \\\"h...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "hn-qa1": {"config_name": "hn-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Sita gusalkhaney mein gayi.\\\", \\\"Priya sayanakaksh...\", \"story.supporting_ids\": \"[[], [], [\\\"2\\\"], [], [], [\\\"5\\\"], [], [], [\\\"7\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"sayanakaksh\\\", \\\"\\\", \\\"\\\", \\\"rasoi ghar\\\", \\\"\\\", ...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "en-10k-qa1": {"config_name": "en-10k-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Mary moved to the bathroom.\\\", \\\"John went to the ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"bathroom\\\", \\\"\\\", \\\"\\\", \\\"hallway\\\", \\\"\\\", \\\"\\\", \\\"h...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "en-valid-qa1": {"config_name": "en-valid-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Mary moved to the bathroom.\\\", \\\"John went to the ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"bathroom\\\", \\\"\\\", \\\"\\\", \\\"hallway\\\", \\\"\\\", \\\"\\\", \\\"h...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "en-valid-10k-qa1": {"config_name": "en-valid-10k-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Mary moved to the bathroom.\\\", \\\"John went to the ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"bathroom\\\", \\\"\\\", \\\"\\\", \\\"hallway\\\", \\\"\\\", \\\"\\\", \\\"h...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "hn-10k-qa1": {"config_name": "hn-10k-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Sita gusalkhaney mein gayi.\\\", \\\"Priya sayanakaksh...\", \"story.supporting_ids\": \"[[], [], [\\\"2\\\"], [], [], [\\\"5\\\"], [], [], [\\\"7\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"sayanakaksh\\\", \\\"\\\", \\\"\\\", \\\"rasoi ghar\\\", \\\"\\\", ...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "shuffled-qa1": {"config_name": "shuffled-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Utxi ybnha qb qzh ptqzxbby.\\\", \\\"Hbzm jhmq qb qzh ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"ptqzxbby\\\", \\\"\\\", \\\"\\\", \\\"ztuujti\\\", \\\"\\\", \\\"\\\", \\\"z...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}, "shuffled-10k-qa1": {"config_name": "shuffled-10k-qa1", "sample_row": "{\"story.id\": \"[\\\"1\\\", \\\"2\\\", \\\"3\\\", \\\"4\\\", \\\"5\\\", \\\"6\\\", \\\"7\\\", \\\"8\\\", \\\"9\\\", \\\"10\\\"...\", \"story.type\": \"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]\", \"story.text\": \"[\\\"Utxi ybnha qb qzh ptqzxbby.\\\", \\\"Hbzm jhmq qb qzh ...\", \"story.supporting_ids\": \"[[], [], [\\\"1\\\"], [], [], [\\\"4\\\"], [], [], [\\\"4\\\"], [], ...\", \"story.answer\": \"[\\\"\\\", \\\"\\\", \\\"ptqzxbby\\\", \\\"\\\", \\\"\\\", \\\"ztuujti\\\", \\\"\\\", \\\"\\\", \\\"z...\"}", "columns": ["story_id", "story_type", "story_text", "story_supporting_ids", "story_answer"], "columns_mapping": {"story.id": "story_id", "story.type": "story_type", "story.text": "story_text", "story.supporting_ids": "story_supporting_ids", "story.answer": "story_answer"}, "dataset_description": "The (20) QA bAbI tasks are a set of proxy tasks that evaluate reading\ncomprehension via question answering. Our tasks measure understanding\nin several ways: whether a system is able to answer questions via chaining facts,\nsimple induction, deduction and many more. The tasks are designed to be prerequisites\nfor any system that aims to be capable of conversing with a human.\nThe aim is to classify these tasks into skill sets,so that researchers\ncan identify (and then rectify)the failings of their systems.\n", "dataset_name": "facebook/babi_qa"}}, "tags": ["task_categories:question-answering", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "chained-qa"], "is_gated": false}, "banking77": {"dataset_name": "banking77", "description": "BANKING77 dataset provides a very fine-grained set of intents in a banking domain.\nIt comprises 13,083 customer service queries labeled with 77 intents.\nIt focuses on fine-grained single-domain intent detection.", "downloads": 5093, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"I am still waiting on my card?\\\"\", \"label\": \"11\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "BANKING77 dataset provides a very fine-grained set of intents in a banking domain.\nIt comprises 13,083 customer service queries labeled with 77 intents.\nIt focuses on fine-grained single-domain intent detection.\n", "dataset_name": "banking77"}}, "tags": ["task_categories:text-classification", "task_ids:intent-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "bbaw_egyptian": {"dataset_name": "bbaw_egyptian", "description": "This dataset comprises parallel sentences of hieroglyphic encodings, transcription and translation\nas used in the paper Multi-Task Modeling of Phonographic Languages: Translating Middle Egyptian\nHieroglyph. The data triples are extracted from the digital corpus of Egyptian texts compiled by\nthe project \"Strukturen und Transformationen des Wortschatzes der \u00e4gyptischen Sprache\".", "downloads": 344, "configs": {"default": {"config_name": "default", "sample_row": "{\"transcription\": \"\\\"\\\\u2e22p\\\\u1e0f,wt-9\\\\u2e23 n =f [\\\\u2e2e\\\\u1e25tr...\", \"translation\": \"\\\"... die Neun-Bogenv\\\\u00f6lker ... zu ihm ... Pfer...\", \"hieroglyphs\": \"\\\"\\\"\"}", "columns": ["transcription", "translation", "hieroglyphs"], "columns_mapping": {"transcription": "transcription", "translation": "translation", "hieroglyphs": "hieroglyphs"}, "dataset_description": "This dataset comprises parallel sentences of hieroglyphic encodings, transcription and translation\nas used in the paper Multi-Task Modeling of Phonographic Languages: Translating Middle Egyptian\nHieroglyph. The data triples are extracted from the digital corpus of Egyptian texts compiled by\nthe project \"Strukturen und Transformationen des Wortschatzes der \u00e4gyptischen Sprache\".\n", "dataset_name": "bbaw_egyptian"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:extended|wikipedia", "language:de", "language:egy", "language:en"], "is_gated": false}, "bbc_hindi_nli": {"dataset_name": "bbc_hindi_nli", "description": "This dataset is used to train models for Natural Language Inference Tasks in Low-Resource Languages like Hindi.", "downloads": 380, "configs": {"bbc hindi nli": {"config_name": "bbc hindi nli", "sample_row": "{\"premise\": \"\\\"\\\\u0917\\\\u094b\\\\u092a\\\\u0928\\\\u0940\\\\u092f\\\\u0924\\\\u093e ...\", \"hypothesis\": \"\\\"\\\\u092f\\\\u0939 \\\\u0916\\\\u092c\\\\u0930 \\\\u0915\\\\u0940 \\\\u09...\", \"label\": \"1\", \"topic\": \"1\"}", "columns": ["premise", "hypothesis", "label", "topic"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "topic": "topic"}, "dataset_description": "This dataset is used to train models for Natural Language Inference Tasks in Low-Resource Languages like Hindi.\n", "dataset_name": "bbc_hindi_nli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:extended|bbc__hindi_news_classification", "language:hi"], "is_gated": false}, "bc2gm_corpus": {"dataset_name": "bc2gm_corpus", "description": "Nineteen teams presented results for the Gene Mention Task at the BioCreative II Workshop.\nIn this task participants designed systems to identify substrings in sentences corresponding to gene name mentions.\nA variety of different methods were used and the results varied with a highest achieved F1 score of 0.8721.\nHere we present brief descriptions of all the methods used and a statistical analysis of the results.\nWe also demonstrate that, by combining the results from all submissions, an F score of 0.9066 is feasible,\nand furthermore that the best result makes use of the lowest scoring submissions.\n\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/\n\nThe original dataset can be downloaded from: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-ii-corpus/\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll", "downloads": 670, "configs": {"bc2gm_corpus": {"config_name": "bc2gm_corpus", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Comparison\\\", \\\"with\\\", \\\"alkaline\\\", \\\"phosphatases\\\",...\", \"ner_tags\": \"[0, 0, 1, 2, 0, 1, 2, 2]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "Nineteen teams presented results for the Gene Mention Task at the BioCreative II Workshop.\nIn this task participants designed systems to identify substrings in sentences corresponding to gene name mentions.\nA variety of different methods were used and the results varied with a highest achieved F1 score of 0.8721.\nHere we present brief descriptions of all the methods used and a statistical analysis of the results.\nWe also demonstrate that, by combining the results from all submissions, an F score of 0.9066 is feasible,\nand furthermore that the best result makes use of the lowest scoring submissions.\n\nFor more details, see: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2559986/\n\nThe original dataset can be downloaded from: https://biocreative.bioinformatics.udel.edu/resources/corpora/biocreative-ii-corpus/\nThis dataset has been converted to CoNLL format for NER using the following tool: https://github.com/spyysalo/standoff2conll\n", "dataset_name": "bc2gm_corpus"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "best2009": {"dataset_name": "best2009", "description": "`best2009` is a Thai word-tokenization dataset from encyclopedia, novels, news and articles by\n[NECTEC](https://www.nectec.or.th/) (148,995/2,252 lines of train/test). It was created for\n[BEST 2010: Word Tokenization Competition](https://thailang.nectec.or.th/archive/indexa290.html?q=node/10).\nThe test set answers are not provided publicly.", "downloads": 335, "configs": {"best2009": {"config_name": "best2009", "sample_row": "{\"fname\": \"\\\"article_00001.txt\\\"\", \"char\": \"[\\\"\\\\u0e01\\\", \\\"\\\\u0e0e\\\", \\\"\\\\u0e2b\\\", \\\"\\\\u0e21\\\", \\\"\\\\u0e32\\\",...\", \"char_type\": \"[1, 1, 3, 1, 10, 1, 1, 4, 1, 1, 10, 1, 11, 1, 10, ...\", \"is_beginning\": \"[1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0...\"}", "columns": ["fname", "char", "char_type", "is_beginning"], "columns_mapping": {"fname": "fname", "char": "char", "char_type": "char_type", "is_beginning": "is_beginning"}, "dataset_description": "`best2009` is a Thai word-tokenization dataset from encyclopedia, novels, news and articles by\n[NECTEC](https://www.nectec.or.th/) (148,995/2,252 lines of train/test). It was created for\n[BEST 2010: Word Tokenization Competition](https://thailang.nectec.or.th/archive/indexa290.html?q=node/10).\nThe test set answers are not provided publicly.\n", "dataset_name": "best2009"}}, "tags": ["task_categories:token-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:th", "word-tokenization"], "is_gated": false}, "bianet": {"dataset_name": "bianet", "description": "A parallel news corpus in Turkish, Kurdish and English.\nBianet collects 3,214 Turkish articles with their sentence-aligned Kurdish or English translations from the Bianet online newspaper.\n3 languages, 3 bitexts\ntotal number of files: 6\ntotal number of tokens: 2.25M\ntotal number of sentence fragments: 0.14M", "downloads": 669, "configs": {"en_to_ku": {"config_name": "en_to_ku", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Diyarbak\\\\u0131r 2nd Criminal Court of Peace has i...\", \"translation.ku\": \"\\\"Biryara qedexekirin\\\\u00ea di r\\\\u00fbpela Lijneya ...\"}", "columns": ["id", "translation_en", "translation_ku"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.ku": "translation_ku"}, "dataset_description": "A parallel news corpus in Turkish, Kurdish and English.\nBianet collects 3,214 Turkish articles with their sentence-aligned Kurdish or English translations from the Bianet online newspaper.\n3 languages, 3 bitexts\ntotal number of files: 6\ntotal number of tokens: 2.25M\ntotal number of sentence fragments: 0.14M\n", "dataset_name": "bianet"}, "en_to_tr": {"config_name": "en_to_tr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"The members of FEMEN stripped their tops in a sch...\", \"translation.tr\": \"\\\"FEMEN \\\\u00fcyeleri \\\\u00dcsk\\\\u00fcdar'daki bir oku...\"}", "columns": ["id", "translation_en", "translation_tr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.tr": "translation_tr"}, "dataset_description": "A parallel news corpus in Turkish, Kurdish and English.\nBianet collects 3,214 Turkish articles with their sentence-aligned Kurdish or English translations from the Bianet online newspaper.\n3 languages, 3 bitexts\ntotal number of files: 6\ntotal number of tokens: 2.25M\ntotal number of sentence fragments: 0.14M\n", "dataset_name": "bianet"}, "ku_to_tr": {"config_name": "ku_to_tr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.ku\": \"\\\"Biryara qedexekirin\\\\u00ea di r\\\\u00fbpela Lijneya ...\", \"translation.tr\": \"\\\"Karar\\\\u0131 duyuran Radyo ve Televizyon \\\\u00dcst ...\"}", "columns": ["id", "translation_ku", "translation_tr"], "columns_mapping": {"id": "id", "translation.ku": "translation_ku", "translation.tr": "translation_tr"}, "dataset_description": "A parallel news corpus in Turkish, Kurdish and English.\nBianet collects 3,214 Turkish articles with their sentence-aligned Kurdish or English translations from the Bianet online newspaper.\n3 languages, 3 bitexts\ntotal number of files: 6\ntotal number of tokens: 2.25M\ntotal number of sentence fragments: 0.14M\n", "dataset_name": "bianet"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:en", "language:ku", "language:tr"], "is_gated": false}, "bible_para": {"dataset_name": "bible_para", "description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M", "downloads": 1193, "configs": {"de-en": {"config_name": "de-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Am Anfang schuf Gott Himmel und Erde.\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\"}", "columns": ["id", "translation_de", "translation_en"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.en": "translation_en"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}, "en-fr": {"config_name": "en-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\", \"translation.fr\": \"\\\"Au commencement, Dieu cr\\\\u00e9a les cieux et la t...\"}", "columns": ["id", "translation_en", "translation_fr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}, "en-es": {"config_name": "en-es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\", \"translation.es\": \"\\\"En el principio cre\\\\u00f3 Dios los cielos y la ti...\"}", "columns": ["id", "translation_en", "translation_es"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.es": "translation_es"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}, "en-fi": {"config_name": "en-fi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\", \"translation.fi\": \"\\\"Alussa loi Jumala taivaan ja maan.\\\"\"}", "columns": ["id", "translation_en", "translation_fi"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fi": "translation_fi"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}, "en-no": {"config_name": "en-no", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\", \"translation.no\": \"\\\"I begynnelsen skapte Gud himmelen og jorden.\\\"\"}", "columns": ["id", "translation_en", "translation_no"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.no": "translation_no"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}, "en-hi": {"config_name": "en-hi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"In the beginning God created the heavens and the ...\", \"translation.hi\": \"\\\"\\\\u0906\\\\u0926\\\\u093f \\\\u092e\\\\u0947\\\\u0902 \\\\u092a\\\\u093...\"}", "columns": ["id", "translation_en", "translation_hi"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.hi": "translation_hi"}, "dataset_description": "This is a multilingual parallel corpus created from translations of the Bible compiled by Christos Christodoulopoulos and Mark Steedman.\n\n102 languages, 5,148 bitexts\ntotal number of files: 107\ntotal number of tokens: 56.43M\ntotal number of sentence fragments: 2.84M\n", "dataset_name": "bible_para"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:acu", "language:af", "language:agr", "language:ake", "language:am", "language:amu", "language:ar", "language:bg", "language:bsn", "language:cak", "language:ceb", "language:ch", "language:chq", "language:chr", "language:cjp", "language:cni", "language:cop", "language:crp", "language:cs", "language:da", "language:de", "language:dik", "language:dje", "language:djk", "language:dop", "language:ee", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fi", "language:fr", "language:gbi", "language:gd", "language:gu", "language:gv", "language:he", "language:hi", "language:hr", "language:hu", "language:hy", "language:id", "language:is", "language:it", "language:ja", "language:jak", "language:jiv", "language:kab", "language:kbh", "language:kek", "language:kn", "language:ko", "language:la", "language:lt", "language:lv", "language:mam", "language:mi", "language:ml", "language:mr", "language:my", "language:ne", "language:nhg", "language:nl", "language:no", "language:ojb", "language:pck", "language:pes", "language:pl", "language:plt", "language:pot", "language:ppk", "language:pt", "language:quc", "language:quw", "language:ro", "language:rom", "language:ru", "language:shi", "language:sk", "language:sl", "language:sn", "language:so", "language:sq", "language:sr", "language:ss", "language:sv", "language:syr", "language:te", "language:th", "language:tl", "language:tmh", "language:tr", "language:uk", "language:usp", "language:vi", "language:wal", "language:wo", "language:xh", "language:zh", "language:zu"], "is_gated": false}, "big_patent": {"dataset_name": "big_patent", "description": "BIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.", "downloads": 2904, "configs": {"all": {"config_name": "all", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n [0001] This ...\", \"abstract\": \"\\\"This invention relates to novel calcium phosphate...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "a": {"config_name": "a", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n [0001] This ...\", \"abstract\": \"\\\"This invention relates to novel calcium phosphate...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "b": {"config_name": "b", "sample_row": "{\"description\": \"\\\"BACKGROUND OF THE INVENTION \\\\n [0001] ...\", \"abstract\": \"\\\"A releasable fastener for an album to permit inse...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "c": {"config_name": "c", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n The present inventi...\", \"abstract\": \"\\\"The invention concerns a polypeptide selected fro...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "d": {"config_name": "d", "sample_row": "{\"description\": \"\\\"BACKGROUND OF THE INVENTION \\\\n This invention...\", \"abstract\": \"\\\"A method of forming fiber mixtures from different...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "e": {"config_name": "e", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n This invention broa...\", \"abstract\": \"\\\"A method and apparatus for achieving adiabatic he...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "f": {"config_name": "f", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n The present inventi...\", \"abstract\": \"\\\"A range for a recreational vehicle which is adapt...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "g": {"config_name": "g", "sample_row": "{\"description\": \"\\\"CROSS-REFERENCE TO RELATED APPLICATIONS \\\\n ...\", \"abstract\": \"\\\"Methods and systems are provided for obtaining in...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "h": {"config_name": "h", "sample_row": "{\"description\": \"\\\"BACKGROUND OF THE INVENTION \\\\n 1. Field of th...\", \"abstract\": \"\\\"A field programmable gate array (FPGA) with pass ...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}, "y": {"config_name": "y", "sample_row": "{\"description\": \"\\\"FIELD OF THE INVENTION \\\\n This invention rela...\", \"abstract\": \"\\\"A camouflage wrapping strip that takes the form o...\"}", "columns": ["description", "abstract"], "columns_mapping": {"description": "description", "abstract": "abstract"}, "dataset_description": "\nBIGPATENT, consisting of 1.3 million records of U.S. patent documents\nalong with human written abstractive summaries.\nEach US patent application is filed under a Cooperative Patent Classification\n(CPC) code. There are nine such classification categories:\nA (Human Necessities), B (Performing Operations; Transporting),\nC (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),\nF (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),\nG (Physics), H (Electricity), and\nY (General tagging of new or cross-sectional technology)\nThere are two features:\n - description: detailed description of patent.\n - abstract: Patent abastract.\n", "dataset_name": "big_patent"}}, "tags": ["task_categories:summarization", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en", "patent-summarization"], "is_gated": false}, "biosses": {"dataset_name": "biosses", "description": "BIOSSES is a benchmark dataset for biomedical sentence similarity estimation. The dataset comprises 100 sentence pairs, in which each sentence was selected from the TAC (Text Analysis Conference) Biomedical Summarization Track Training Dataset containing articles from the biomedical domain. The sentence pairs were evaluated by five different human experts that judged their similarity and gave scores ranging from 0 (no relation) to 4 (equivalent).", "downloads": 1330, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence1\": \"\\\"Here, looking for agents that could specifically ...\", \"sentence2\": \"\\\"Not surprisingly, GATA2 knockdown in KRAS mutant ...\", \"score\": \"2.2\"}", "columns": ["sentence1", "sentence2", "score"], "columns_mapping": {"sentence1": "sentence1", "sentence2": "sentence2", "score": "score"}, "dataset_description": "BIOSSES is a benchmark dataset for biomedical sentence similarity estimation. The dataset comprises 100 sentence pairs, in which each sentence was selected from the TAC (Text Analysis Conference) Biomedical Summarization Track Training Dataset containing articles from the biomedical domain. The sentence pairs were evaluated by five different human experts that judged their similarity and gave scores ranging from 0 (no relation) to 4 (equivalent).\n", "dataset_name": "biosses"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:semantic-similarity-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "blended_skill_talk": {"dataset_name": "blended_skill_talk", "description": "A dataset of 7k conversations explicitly designed to exhibit multiple conversation modes: displaying personality, having empathy, and demonstrating knowledge.", "downloads": 615, "configs": {"default": {"config_name": "default", "sample_row": "{\"personas\": \"[\\\"i've 2 kids.\\\", \\\"i love flowers.\\\"]\", \"additional_context\": \"\\\"\\\"\", \"previous_utterance\": \"[\\\"I love live music, that's why I try to go to con...\", \"context\": \"\\\"empathetic_dialogues\\\"\", \"free_messages\": \"[\\\"I like acting, I hope to be an actor, what about...\", \"guided_messages\": \"[\\\"that is ok. have any kids?\\\", \\\"that is good. I h...\", \"suggestions.convai2\": \"[\\\"i love acting ! i'll be famous someday . what do...\", \"suggestions.empathetic_dialogues\": \"[\\\"Any favorite actors?\\\", \\\"One day.\\\", \\\"How long mus...\", \"suggestions.wizard_of_wikipedia\": \"[\\\"I would like to develop my acting skills. What a...\", \"guided_chosen_suggestions\": \"[\\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\", \\\"\\\"]\", \"label_candidates\": \"[]\"}", "columns": ["personas", "additional_context", "previous_utterance", "context", "free_messages", "guided_messages", "suggestions_convai2", "suggestions_empathetic_dialogues", "suggestions_wizard_of_wikipedia", "guided_chosen_suggestions", "label_candidates"], "columns_mapping": {"personas": "personas", "additional_context": "additional_context", "previous_utterance": "previous_utterance", "context": "context", "free_messages": "free_messages", "guided_messages": "guided_messages", "suggestions.convai2": "suggestions_convai2", "suggestions.empathetic_dialogues": "suggestions_empathetic_dialogues", "suggestions.wizard_of_wikipedia": "suggestions_wizard_of_wikipedia", "guided_chosen_suggestions": "guided_chosen_suggestions", "label_candidates": "label_candidates"}, "dataset_description": "A dataset of 7k conversations explicitly designed to exhibit multiple conversation modes: displaying personality, having empathy, and demonstrating knowledge.\n", "dataset_name": "blended_skill_talk"}}, "tags": ["task_categories:conversational", "task_ids:dialogue-generation", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "blimp": {"dataset_name": "blimp", "description": "BLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.", "downloads": 36055, "configs": {"adjunct_island": {"config_name": "adjunct_island", "sample_row": "{\"sentence_good\": \"\\\"Who should Derek hug after shocking Richard?\\\"\", \"sentence_bad\": \"\\\"Who should Derek hug Richard after shocking?\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"adjunct_island\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "anaphor_gender_agreement": {"config_name": "anaphor_gender_agreement", "sample_row": "{\"sentence_good\": \"\\\"Katherine can't help herself.\\\"\", \"sentence_bad\": \"\\\"Katherine can't help himself.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"anaphor_agreement\\\"\", \"UID\": \"\\\"anaphor_gender_agreement\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "anaphor_number_agreement": {"config_name": "anaphor_number_agreement", "sample_row": "{\"sentence_good\": \"\\\"Susan revealed herself.\\\"\", \"sentence_bad\": \"\\\"Susan revealed themselves.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"anaphor_agreement\\\"\", \"UID\": \"\\\"anaphor_number_agreement\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "animate_subject_passive": {"config_name": "animate_subject_passive", "sample_row": "{\"sentence_good\": \"\\\"Amanda was respected by some waitresses.\\\"\", \"sentence_bad\": \"\\\"Amanda was respected by some picture.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"s-selection\\\"\", \"UID\": \"\\\"animate_subject_passive\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "animate_subject_trans": {"config_name": "animate_subject_trans", "sample_row": "{\"sentence_good\": \"\\\"Tina revealed Margaret.\\\"\", \"sentence_bad\": \"\\\"The horse revealed Margaret.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"s-selection\\\"\", \"UID\": \"\\\"animate_subject_trans\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "causative": {"config_name": "causative", "sample_row": "{\"sentence_good\": \"\\\"Aaron breaks the glass.\\\"\", \"sentence_bad\": \"\\\"Aaron appeared the glass.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"causative\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "complex_NP_island": {"config_name": "complex_NP_island", "sample_row": "{\"sentence_good\": \"\\\"Who aren't most hospitals that hadn't talked abou...\", \"sentence_bad\": \"\\\"Who aren't most waitresses alarming most hospital...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"complex_NP_island\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "coordinate_structure_constraint_complex_left_branch": {"config_name": "coordinate_structure_constraint_complex_left_branch", "sample_row": "{\"sentence_good\": \"\\\"What senators was Alicia approaching and some tea...\", \"sentence_bad\": \"\\\"What was Alicia approaching senators and some tea...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"coordinate_structure_constraint_complex_left_bran...\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "coordinate_structure_constraint_object_extraction": {"config_name": "coordinate_structure_constraint_object_extraction", "sample_row": "{\"sentence_good\": \"\\\"Who were all men and Eric leaving?\\\"\", \"sentence_bad\": \"\\\"Who were all men leaving and Eric?\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"coordinate_structure_constraint_object_extraction...\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_1": {"config_name": "determiner_noun_agreement_1", "sample_row": "{\"sentence_good\": \"\\\"Raymond is selling this sketch.\\\"\", \"sentence_bad\": \"\\\"Raymond is selling this sketches.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_2": {"config_name": "determiner_noun_agreement_2", "sample_row": "{\"sentence_good\": \"\\\"Some dog stunned this committee.\\\"\", \"sentence_bad\": \"\\\"Some dog stunned these committee.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_irregular_1": {"config_name": "determiner_noun_agreement_irregular_1", "sample_row": "{\"sentence_good\": \"\\\"Laurie hasn't lifted those cacti.\\\"\", \"sentence_bad\": \"\\\"Laurie hasn't lifted those cactus.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_irregular_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_irregular_2": {"config_name": "determiner_noun_agreement_irregular_2", "sample_row": "{\"sentence_good\": \"\\\"All boys boast about that child.\\\"\", \"sentence_bad\": \"\\\"All boys boast about those child.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_irregular_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_with_adj_2": {"config_name": "determiner_noun_agreement_with_adj_2", "sample_row": "{\"sentence_good\": \"\\\"Cynthia scans these hard books.\\\"\", \"sentence_bad\": \"\\\"Cynthia scans this hard books.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_with_adj_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_with_adj_irregular_1": {"config_name": "determiner_noun_agreement_with_adj_irregular_1", "sample_row": "{\"sentence_good\": \"\\\"Some waiters broke this lost foot.\\\"\", \"sentence_bad\": \"\\\"Some waiters broke this lost feet.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_with_adj_irregular_1\\\"...\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_with_adj_irregular_2": {"config_name": "determiner_noun_agreement_with_adj_irregular_2", "sample_row": "{\"sentence_good\": \"\\\"Alexander didn't walk through that new oasis.\\\"\", \"sentence_bad\": \"\\\"Alexander didn't walk through those new oasis.\\\"...\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_with_adj_irregular_2\\\"...\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "determiner_noun_agreement_with_adjective_1": {"config_name": "determiner_noun_agreement_with_adjective_1", "sample_row": "{\"sentence_good\": \"\\\"Rebecca was criticizing those good documentaries....\", \"sentence_bad\": \"\\\"Rebecca was criticizing those good documentary.\\\"...\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"determiner_noun_agreement\\\"\", \"UID\": \"\\\"determiner_noun_agreement_with_adjective_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "distractor_agreement_relational_noun": {"config_name": "distractor_agreement_relational_noun", "sample_row": "{\"sentence_good\": \"\\\"A niece of most senators hasn't descended most sl...\", \"sentence_bad\": \"\\\"A niece of most senators haven't descended most s...\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"distractor_agreement_relational_noun\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "distractor_agreement_relative_clause": {"config_name": "distractor_agreement_relative_clause", "sample_row": "{\"sentence_good\": \"\\\"This customer who had visited most children has w...\", \"sentence_bad\": \"\\\"This customer who had visited most children have ...\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"distractor_agreement_relative_clause\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "drop_argument": {"config_name": "drop_argument", "sample_row": "{\"sentence_good\": \"\\\"Travis is touring.\\\"\", \"sentence_bad\": \"\\\"Travis is revealing.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"drop_argument\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "ellipsis_n_bar_1": {"config_name": "ellipsis_n_bar_1", "sample_row": "{\"sentence_good\": \"\\\"Dawn's ex-husband wasn't going to one rough groce...\", \"sentence_bad\": \"\\\"Dawn's ex-husband wasn't going to one grocery sto...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"ellipsis\\\"\", \"UID\": \"\\\"ellipsis_n_bar_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "ellipsis_n_bar_2": {"config_name": "ellipsis_n_bar_2", "sample_row": "{\"sentence_good\": \"\\\"A friend of Pamela hasn't attacked one person and...\", \"sentence_bad\": \"\\\"A friend of Pamela hasn't attacked one unemployed...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"ellipsis\\\"\", \"UID\": \"\\\"ellipsis_n_bar_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "existential_there_object_raising": {"config_name": "existential_there_object_raising", "sample_row": "{\"sentence_good\": \"\\\"William has declared there to be no guests gettin...\", \"sentence_bad\": \"\\\"William has obliged there to be no guests getting...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"control_raising\\\"\", \"UID\": \"\\\"existential_there_object_raising\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "existential_there_quantifiers_1": {"config_name": "existential_there_quantifiers_1", "sample_row": "{\"sentence_good\": \"\\\"There was a documentary about music irritating Al...\", \"sentence_bad\": \"\\\"There was each documentary about music irritating...\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"quantifiers\\\"\", \"UID\": \"\\\"existential_there_quantifiers_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "existential_there_quantifiers_2": {"config_name": "existential_there_quantifiers_2", "sample_row": "{\"sentence_good\": \"\\\"All convertibles weren't there existing.\\\"\", \"sentence_bad\": \"\\\"There weren't all convertibles existing.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"quantifiers\\\"\", \"UID\": \"\\\"existential_there_quantifiers_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "existential_there_subject_raising": {"config_name": "existential_there_subject_raising", "sample_row": "{\"sentence_good\": \"\\\"There is soon to be a cat existing.\\\"\", \"sentence_bad\": \"\\\"There is willing to be a cat existing.\\\"\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"control_raising\\\"\", \"UID\": \"\\\"existential_there_subject_raising\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "expletive_it_object_raising": {"config_name": "expletive_it_object_raising", "sample_row": "{\"sentence_good\": \"\\\"Tara would ascertain it to be noteworthy that Ken...\", \"sentence_bad\": \"\\\"Tara wouldn't entice it to be noteworthy that Ken...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"control_raising\\\"\", \"UID\": \"\\\"expletive_it_object_raising\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "inchoative": {"config_name": "inchoative", "sample_row": "{\"sentence_good\": \"\\\"Patricia had changed.\\\"\", \"sentence_bad\": \"\\\"Patricia had forgotten.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"inchoative\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "intransitive": {"config_name": "intransitive", "sample_row": "{\"sentence_good\": \"\\\"Todd can't yawn.\\\"\", \"sentence_bad\": \"\\\"Todd can't walk through.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"intransitive\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "irregular_past_participle_adjectives": {"config_name": "irregular_past_participle_adjectives", "sample_row": "{\"sentence_good\": \"\\\"The hidden offspring aren't confident.\\\"\", \"sentence_bad\": \"\\\"The hid offspring aren't confident.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"irregular_forms\\\"\", \"UID\": \"\\\"irregular_past_participle_adjectives\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "irregular_past_participle_verbs": {"config_name": "irregular_past_participle_verbs", "sample_row": "{\"sentence_good\": \"\\\"The Borgias wore a lot of scarves.\\\"\", \"sentence_bad\": \"\\\"The Borgias worn a lot of scarves.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"irregular_forms\\\"\", \"UID\": \"\\\"irregular_past_participle_verbs\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "irregular_plural_subject_verb_agreement_1": {"config_name": "irregular_plural_subject_verb_agreement_1", "sample_row": "{\"sentence_good\": \"\\\"Those radii have scared that teenager.\\\"\", \"sentence_bad\": \"\\\"Those radii has scared that teenager.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"irregular_plural_subject_verb_agreement_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "irregular_plural_subject_verb_agreement_2": {"config_name": "irregular_plural_subject_verb_agreement_2", "sample_row": "{\"sentence_good\": \"\\\"The women meet.\\\"\", \"sentence_bad\": \"\\\"The woman meet.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"irregular_plural_subject_verb_agreement_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "left_branch_island_echo_question": {"config_name": "left_branch_island_echo_question", "sample_row": "{\"sentence_good\": \"\\\"Irene had messed up whose rug?\\\"\", \"sentence_bad\": \"\\\"Whose had Irene messed up rug?\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"left_branch_island_echo_question\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "left_branch_island_simple_question": {"config_name": "left_branch_island_simple_question", "sample_row": "{\"sentence_good\": \"\\\"Whose museums had Dana alarmed?\\\"\", \"sentence_bad\": \"\\\"Whose had Dana alarmed museums?\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"left_branch_island_simple_question\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "matrix_question_npi_licensor_present": {"config_name": "matrix_question_npi_licensor_present", "sample_row": "{\"sentence_good\": \"\\\"Had Bruce ever played?\\\"\", \"sentence_bad\": \"\\\"Bruce had ever played.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"matrix_question_npi_licensor_present\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "npi_present_1": {"config_name": "npi_present_1", "sample_row": "{\"sentence_good\": \"\\\"Even Suzanne has really joked around.\\\"\", \"sentence_bad\": \"\\\"Even Suzanne has ever joked around.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"npi_present_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "npi_present_2": {"config_name": "npi_present_2", "sample_row": "{\"sentence_good\": \"\\\"Tamara really exited those mountains.\\\"\", \"sentence_bad\": \"\\\"Tamara ever exited those mountains.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"npi_present_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "only_npi_licensor_present": {"config_name": "only_npi_licensor_present", "sample_row": "{\"sentence_good\": \"\\\"Only Bill would ever complain.\\\"\", \"sentence_bad\": \"\\\"Even Bill would ever complain.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"only_npi_licensor_present\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "only_npi_scope": {"config_name": "only_npi_scope", "sample_row": "{\"sentence_good\": \"\\\"Only the grandsons of the Impressionists who Coll...\", \"sentence_bad\": \"\\\"The grandsons of the Impressionists who only Coll...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"only_npi_scope\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "passive_1": {"config_name": "passive_1", "sample_row": "{\"sentence_good\": \"\\\"Lucille's sisters are confused by Amy.\\\"\", \"sentence_bad\": \"\\\"Lucille's sisters are communicated by Amy.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"passive_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "passive_2": {"config_name": "passive_2", "sample_row": "{\"sentence_good\": \"\\\"A lot of nieces of some actor aren't scared.\\\"\", \"sentence_bad\": \"\\\"A lot of nieces of some actor aren't wept.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"passive_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_c_command": {"config_name": "principle_A_c_command", "sample_row": "{\"sentence_good\": \"\\\"A lot of patients who can sell some couch didn't ...\", \"sentence_bad\": \"\\\"A lot of patients who can sell some couch didn't ...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_c_command\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_case_1": {"config_name": "principle_A_case_1", "sample_row": "{\"sentence_good\": \"\\\"The teenagers explain that they aren't breaking a...\", \"sentence_bad\": \"\\\"The teenagers explain that themselves aren't brea...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_case_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_case_2": {"config_name": "principle_A_case_2", "sample_row": "{\"sentence_good\": \"\\\"Eric imagines himself taking every rug.\\\"\", \"sentence_bad\": \"\\\"Eric imagines himself took every rug.\\\"\", \"field\": \"\\\"syntax/semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_case_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_domain_1": {"config_name": "principle_A_domain_1", "sample_row": "{\"sentence_good\": \"\\\"Carla had explained that Samuel has discussed her...\", \"sentence_bad\": \"\\\"Carla had explained that Samuel has discussed her...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_domain_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_domain_2": {"config_name": "principle_A_domain_2", "sample_row": "{\"sentence_good\": \"\\\"Donald can imagine those college campuses are bor...\", \"sentence_bad\": \"\\\"Donald can imagine those college campuses are bor...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_domain_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_domain_3": {"config_name": "principle_A_domain_3", "sample_row": "{\"sentence_good\": \"\\\"Steven explains Kayla won't hurt herself.\\\"\", \"sentence_bad\": \"\\\"Kayla explains Steven won't hurt herself.\\\"\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_domain_3\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "principle_A_reconstruction": {"config_name": "principle_A_reconstruction", "sample_row": "{\"sentence_good\": \"\\\"It's himself that this cashier attacked.\\\"\", \"sentence_bad\": \"\\\"It's himself that attacked this cashier.\\\"\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"binding\\\"\", \"UID\": \"\\\"principle_A_reconstruction\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "regular_plural_subject_verb_agreement_1": {"config_name": "regular_plural_subject_verb_agreement_1", "sample_row": "{\"sentence_good\": \"\\\"Paula references Robert.\\\"\", \"sentence_bad\": \"\\\"Paula reference Robert.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"regular_plural_subject_verb_agreement_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "regular_plural_subject_verb_agreement_2": {"config_name": "regular_plural_subject_verb_agreement_2", "sample_row": "{\"sentence_good\": \"\\\"The students perform.\\\"\", \"sentence_bad\": \"\\\"The student perform.\\\"\", \"field\": \"\\\"morphology\\\"\", \"linguistics_term\": \"\\\"subject_verb_agreement\\\"\", \"UID\": \"\\\"regular_plural_subject_verb_agreement_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "sentential_negation_npi_licensor_present": {"config_name": "sentential_negation_npi_licensor_present", "sample_row": "{\"sentence_good\": \"\\\"Teresa had not ever sold a movie theater.\\\"\", \"sentence_bad\": \"\\\"Teresa had probably ever sold a movie theater.\\\"...\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"sentential_negation_npi_licensor_present\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "sentential_negation_npi_scope": {"config_name": "sentential_negation_npi_scope", "sample_row": "{\"sentence_good\": \"\\\"The associations that had worried Cynthia have no...\", \"sentence_bad\": \"\\\"The associations that had not worried Cynthia hav...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"npi_licensing\\\"\", \"UID\": \"\\\"sentential_negation_npi_scope\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "sentential_subject_island": {"config_name": "sentential_subject_island", "sample_row": "{\"sentence_good\": \"\\\"Who had the patients' cleaning those banks upset....\", \"sentence_bad\": \"\\\"Who had the patients' cleaning upset those banks....\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"sentential_subject_island\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"true\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "superlative_quantifiers_1": {"config_name": "superlative_quantifiers_1", "sample_row": "{\"sentence_good\": \"\\\"No girl attacked fewer than two waiters.\\\"\", \"sentence_bad\": \"\\\"No girl attacked at most two waiters.\\\"\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"quantifiers\\\"\", \"UID\": \"\\\"superlative_quantifiers_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "superlative_quantifiers_2": {"config_name": "superlative_quantifiers_2", "sample_row": "{\"sentence_good\": \"\\\"The teenager does tour at most nine restaurants.\\\"...\", \"sentence_bad\": \"\\\"No teenager does tour at most nine restaurants.\\\"...\", \"field\": \"\\\"semantics\\\"\", \"linguistics_term\": \"\\\"quantifiers\\\"\", \"UID\": \"\\\"superlative_quantifiers_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "tough_vs_raising_1": {"config_name": "tough_vs_raising_1", "sample_row": "{\"sentence_good\": \"\\\"James is pleasant to flee from.\\\"\", \"sentence_bad\": \"\\\"James is apt to flee from.\\\"\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"control_raising\\\"\", \"UID\": \"\\\"tough_vs_raising_1\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "tough_vs_raising_2": {"config_name": "tough_vs_raising_2", "sample_row": "{\"sentence_good\": \"\\\"Every hospital isn't about to tempt Tiffany to re...\", \"sentence_bad\": \"\\\"Every hospital isn't fun to tempt Tiffany to refe...\", \"field\": \"\\\"syntax_semantics\\\"\", \"linguistics_term\": \"\\\"control_raising\\\"\", \"UID\": \"\\\"tough_vs_raising_2\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "transitive": {"config_name": "transitive", "sample_row": "{\"sentence_good\": \"\\\"Some turtles alarm Kimberley.\\\"\", \"sentence_bad\": \"\\\"Some turtles come here Kimberley.\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"argument_structure\\\"\", \"UID\": \"\\\"transitive\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"true\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_island": {"config_name": "wh_island", "sample_row": "{\"sentence_good\": \"\\\"Who have those men revealed they helped?\\\"\", \"sentence_bad\": \"\\\"Who have those men revealed who helped?\\\"\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"island_effects\\\"\", \"UID\": \"\\\"wh_island\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"true\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_questions_object_gap": {"config_name": "wh_questions_object_gap", "sample_row": "{\"sentence_good\": \"\\\"Joel discovered the vase that Patricia might take...\", \"sentence_bad\": \"\\\"Joel discovered what Patricia might take the vase...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_questions_object_gap\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_questions_subject_gap": {"config_name": "wh_questions_subject_gap", "sample_row": "{\"sentence_good\": \"\\\"Brian had questioned an association that can asto...\", \"sentence_bad\": \"\\\"Brian had questioned who an association can astou...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_questions_subject_gap\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_questions_subject_gap_long_distance": {"config_name": "wh_questions_subject_gap_long_distance", "sample_row": "{\"sentence_good\": \"\\\"Dennis has seen this tooth that Kristin wasn't co...\", \"sentence_bad\": \"\\\"Dennis has seen who this tooth that Kristin wasn'...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_questions_subject_gap_long_distance\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_vs_that_no_gap": {"config_name": "wh_vs_that_no_gap", "sample_row": "{\"sentence_good\": \"\\\"Mark figured out that most governments appreciate...\", \"sentence_bad\": \"\\\"Mark figured out who most governments appreciate ...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_vs_that_no_gap\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_vs_that_no_gap_long_distance": {"config_name": "wh_vs_that_no_gap_long_distance", "sample_row": "{\"sentence_good\": \"\\\"Every association figured out that most drivers t...\", \"sentence_bad\": \"\\\"Every association figured out who most drivers th...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_vs_that_no_gap_long_distance\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_vs_that_with_gap": {"config_name": "wh_vs_that_with_gap", "sample_row": "{\"sentence_good\": \"\\\"A lady has remembered who the actors conceal.\\\"\", \"sentence_bad\": \"\\\"A lady has remembered that the actors conceal.\\\"...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_vs_that_with_gap\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}, "wh_vs_that_with_gap_long_distance": {"config_name": "wh_vs_that_with_gap_long_distance", "sample_row": "{\"sentence_good\": \"\\\"Kayla concealed who a lot of guests that were sca...\", \"sentence_bad\": \"\\\"Kayla concealed that a lot of guests that were sc...\", \"field\": \"\\\"syntax\\\"\", \"linguistics_term\": \"\\\"filler_gap_dependency\\\"\", \"UID\": \"\\\"wh_vs_that_with_gap_long_distance\\\"\", \"simple_LM_method\": \"true\", \"one_prefix_method\": \"false\", \"two_prefix_method\": \"false\", \"lexically_identical\": \"false\", \"pair_id\": \"0\"}", "columns": ["sentence_good", "sentence_bad", "field", "linguistics_term", "UID", "simple_LM_method", "one_prefix_method", "two_prefix_method", "lexically_identical", "pair_id"], "columns_mapping": {"sentence_good": "sentence_good", "sentence_bad": "sentence_bad", "field": "field", "linguistics_term": "linguistics_term", "UID": "UID", "simple_LM_method": "simple_LM_method", "one_prefix_method": "one_prefix_method", "two_prefix_method": "two_prefix_method", "lexically_identical": "lexically_identical", "pair_id": "pair_id"}, "dataset_description": "\nBLiMP is a challenge set for evaluating what language models (LMs) know about\nmajor grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\ncontaining 1000 minimal pairs isolating specific contrasts in syntax,\nmorphology, or semantics. The data is automatically generated according to\nexpert-crafted grammars.\n", "dataset_name": "blimp"}}, "tags": ["task_categories:text-classification", "task_ids:acceptability-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "blog_authorship_corpus": {"dataset_name": "blog_authorship_corpus", "description": "The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person.\n\nEach blog is presented as a separate file, the name of which indicates a blogger id# and the blogger\u2019s self-provided gender, age, industry and astrological sign. (All are labeled for gender and age but for many, industry and/or sign is marked as unknown.)\n\nAll bloggers included in the corpus fall into one of three age groups:\n- 8240 \"10s\" blogs (ages 13-17),\n- 8086 \"20s\" blogs (ages 23-27),\n- 2994 \"30s\" blogs (ages 33-47).\n\nFor each age group there are an equal number of male and female bloggers.\n\nEach blog in the corpus includes at least 200 occurrences of common English words. All formatting has been stripped with two exceptions. Individual posts within a single blogger are separated by the date of the following post and links within a post are denoted by the label urllink.\n\nThe corpus may be freely used for non-commercial research purposes.", "downloads": 406, "configs": {"blog_authorship_corpus": {"config_name": "blog_authorship_corpus", "sample_row": "{\"text\": \"\\\"Yeah, sorry for not writing for a whole there, bu...\", \"date\": \"\\\"23,November,2002\\\"\", \"gender\": \"\\\"female\\\"\", \"age\": \"17\", \"horoscope\": \"\\\"Libra\\\"\", \"job\": \"\\\"Student\\\"\"}", "columns": ["text", "date", "gender", "age", "horoscope", "job"], "columns_mapping": {"text": "text", "date": "date", "gender": "gender", "age": "age", "horoscope": "horoscope", "job": "job"}, "dataset_description": "The Blog Authorship Corpus consists of the collected posts of 19,320 bloggers gathered from blogger.com in August 2004. The corpus incorporates a total of 681,288 posts and over 140 million words - or approximately 35 posts and 7250 words per person.\n\nEach blog is presented as a separate file, the name of which indicates a blogger id# and the blogger\u2019s self-provided gender, age, industry and astrological sign. (All are labeled for gender and age but for many, industry and/or sign is marked as unknown.)\n\nAll bloggers included in the corpus fall into one of three age groups:\n- 8240 \"10s\" blogs (ages 13-17),\n- 8086 \"20s\" blogs (ages 23-27),\n- 2994 \"30s\" blogs (ages 33-47).\n\nFor each age group there are an equal number of male and female bloggers.\n\nEach blog in the corpus includes at least 200 occurrences of common English words. All formatting has been stripped with two exceptions. Individual posts within a single blogger are separated by the date of the following post and links within a post are denoted by the label urllink.\n\nThe corpus may be freely used for non-commercial research purposes.\n", "dataset_name": "blog_authorship_corpus"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "bn_hate_speech": {"dataset_name": "bn_hate_speech", "description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and\nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. This is a subset of the full dataset.\n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.", "downloads": 1094, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"\\\\u0987\\\\u09a8\\\\u09bf\\\\u0987 \\\\u09b9\\\\u099a\\\\u09cd\\\\u099b...\", \"label\": \"3\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and\nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. This is a subset of the full dataset.\n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.\n", "dataset_name": "bn_hate_speech"}}, "tags": ["task_categories:text-classification", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:bn", "hate-speech-topic-classification"], "is_gated": false}, "bookcorpus": {"dataset_name": "bookcorpus", "description": "Books are a rich source of both fine-grained information, how a character, an object or a scene looks like, as well as high-level semantics, what someone is thinking, feeling and how these states evolve through a story.This work aims to align books to their movie releases in order to providerich descriptive explanations for visual content that go semantically farbeyond the captions available in current datasets. \\", "downloads": 16732, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\"usually , he would be tearing around the living r...\"}", "columns": ["text"], "columns_mapping": {"text": "text"}, "dataset_description": "Books are a rich source of both fine-grained information, how a character, an object or a scene looks like, as well as high-level semantics, what someone is thinking, feeling and how these states evolve through a story.This work aims to align books to their movie releases in order to providerich descriptive explanations for visual content that go semantically farbeyond the captions available in current datasets. ", "dataset_name": "bookcorpus"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "boolq": {"dataset_name": "boolq", "description": "BoolQ is a question answering dataset for yes/no questions containing 15942 examples. These questions are naturally\noccurring ---they are generated in unprompted and unconstrained settings.\nEach example is a triplet of (question, passage, answer), with the title of the page as optional additional context.\nThe text-pair classification setup is similar to existing natural language inference tasks.", "downloads": 27215, "configs": {"default": {"config_name": "default", "sample_row": "{\"question\": \"\\\"do iran and afghanistan speak the same language\\\"...\", \"answer\": \"true\", \"passage\": \"\\\"Persian (/\\\\u02c8p\\\\u025c\\\\u02d0r\\\\u0292\\\\u0259n, -\\\\u0...\"}", "columns": ["question", "answer", "passage"], "columns_mapping": {"question": "question", "answer": "answer", "passage": "passage"}, "dataset_description": "BoolQ is a question answering dataset for yes/no questions containing 15942 examples. These questions are naturally\noccurring ---they are generated in unprompted and unconstrained settings.\nEach example is a triplet of (question, passage, answer), with the title of the page as optional additional context.\nThe text-pair classification setup is similar to existing natural language inference tasks.\n", "dataset_name": "boolq"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "bsd_ja_en": {"dataset_name": "bsd_ja_en", "description": "This is the Business Scene Dialogue (BSD) dataset,\na Japanese-English parallel corpus containing written conversations\nin various business scenarios.\n\nThe dataset was constructed in 3 steps:\n 1) selecting business scenes,\n 2) writing monolingual conversation scenarios according to the selected scenes, and\n 3) translating the scenarios into the other language.\n\nHalf of the monolingual scenarios were written in Japanese\nand the other half were written in English.\n\nFields:\n- id: dialogue identifier\n- no: sentence pair number within a dialogue\n- en_speaker: speaker name in English\n- ja_speaker: speaker name in Japanese\n- en_sentence: sentence in English\n- ja_sentence: sentence in Japanese\n- original_language: language in which monolingual scenario was written\n- tag: scenario\n- title: scenario title", "downloads": 443, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"190329_J07_03\\\"\", \"tag\": \"\\\"phone call\\\"\", \"title\": \"\\\"\\\\u4f1d\\\\u8a00\\\\u3078\\\\u306e\\\\u6298\\\\u308a\\\\u8fd4\\\\u3057\\\\...\", \"original_language\": \"\\\"ja\\\"\", \"no\": \"1\", \"en_speaker\": \"\\\"Doi-san\\\"\", \"ja_speaker\": \"\\\"\\\\u571f\\\\u4e95\\\\u3055\\\\u3093\\\"\", \"en_sentence\": \"\\\"Hi this is the systems development department of ...\", \"ja_sentence\": \"\\\"\\\\u306f\\\\u3044\\\\u3001K\\\\u793e\\\\u30b7\\\\u30b9\\\\u30c6\\\\u30e0...\"}", "columns": ["id", "tag", "title", "original_language", "no", "en_speaker", "ja_speaker", "en_sentence", "ja_sentence"], "columns_mapping": {"id": "id", "tag": "tag", "title": "title", "original_language": "original_language", "no": "no", "en_speaker": "en_speaker", "ja_speaker": "ja_speaker", "en_sentence": "en_sentence", "ja_sentence": "ja_sentence"}, "dataset_description": "This is the Business Scene Dialogue (BSD) dataset,\na Japanese-English parallel corpus containing written conversations\nin various business scenarios.\n\nThe dataset was constructed in 3 steps:\n 1) selecting business scenes,\n 2) writing monolingual conversation scenarios according to the selected scenes, and\n 3) translating the scenarios into the other language.\n\nHalf of the monolingual scenarios were written in Japanese\nand the other half were written in English.\n\nFields:\n- id: dialogue identifier\n- no: sentence pair number within a dialogue\n- en_speaker: speaker name in English\n- ja_speaker: speaker name in Japanese\n- en_sentence: sentence in English\n- ja_sentence: sentence in Japanese\n- original_language: language in which monolingual scenario was written\n- tag: scenario\n- title: scenario title\n", "dataset_name": "bsd_ja_en"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:en", "language:ja", "business-conversations-translation"], "is_gated": false}, "c3": {"dataset_name": "c3", "description": "Machine reading comprehension tasks require a machine reader to answer questions relevant to the given document. In this paper, we present the first free-form multiple-Choice Chinese machine reading Comprehension dataset (C^3), containing 13,369 documents (dialogues or more formally written mixed-genre texts) and their associated 19,577 multiple-choice free-form questions collected from Chinese-as-a-second-language examinations.\nWe present a comprehensive analysis of the prior knowledge (i.e., linguistic, domain-specific, and general world knowledge) needed for these real-world problems. We implement rule-based and popular neural methods and find that there is still a significant performance gap between the best performing model (68.5%) and human readers (96.0%), especially on problems that require prior knowledge. We further study the effects of distractor plausibility and data augmentation based on translated relevant datasets for English on model performance. We expect C^3 to present great challenges to existing systems as answering 86.8% of questions requires both knowledge within and beyond the accompanying document, and we hope that C^3 can serve as a platform to study how to leverage various kinds of prior knowledge to better understand a given written or orally oriented text.", "downloads": 1234, "configs": {"mixed": {"config_name": "mixed", "sample_row": "{\"documents\": \"[\\\"\\\\u8bb8\\\\u591a\\\\u52a8\\\\u7269\\\\u7684\\\\u67d0\\\\u4e9b\\\\u5668...\", \"document_id\": \"\\\"m13-70\\\"\", \"questions.question\": \"[\\\"\\\\u52a8\\\\u7269\\\\u7684\\\\u5668\\\\u5b98\\\\u611f\\\\u89c9\\\\u4e0e...\", \"questions.answer\": \"[\\\"\\\\u6bd4\\\\u4eba\\\\u7684\\\\u7075\\\\u654f\\\", \\\"\\\\u6c34\\\\u6bcd\\\",...\", \"questions.choice\": \"[[\\\"\\\\u6ca1\\\\u6709\\\\u4eba\\\\u7684\\\\u7075\\\\u654f\\\", \\\"\\\\u548c\\\\...\"}", "columns": ["documents", "document_id", "questions_question", "questions_answer", "questions_choice"], "columns_mapping": {"documents": "documents", "document_id": "document_id", "questions.question": "questions_question", "questions.answer": "questions_answer", "questions.choice": "questions_choice"}, "dataset_description": "Machine reading comprehension tasks require a machine reader to answer questions relevant to the given document. In this paper, we present the first free-form multiple-Choice Chinese machine reading Comprehension dataset (C^3), containing 13,369 documents (dialogues or more formally written mixed-genre texts) and their associated 19,577 multiple-choice free-form questions collected from Chinese-as-a-second-language examinations.\nWe present a comprehensive analysis of the prior knowledge (i.e., linguistic, domain-specific, and general world knowledge) needed for these real-world problems. We implement rule-based and popular neural methods and find that there is still a significant performance gap between the best performing model (68.5%) and human readers (96.0%), especially on problems that require prior knowledge. We further study the effects of distractor plausibility and data augmentation based on translated relevant datasets for English on model performance. We expect C^3 to present great challenges to existing systems as answering 86.8% of questions requires both knowledge within and beyond the accompanying document, and we hope that C^3 can serve as a platform to study how to leverage various kinds of prior knowledge to better understand a given written or orally oriented text.\n", "dataset_name": "c3"}, "dialog": {"config_name": "dialog", "sample_row": "{\"documents\": \"[\\\"\\\\u7537\\\\uff1a\\\\u4f60\\\\u4eca\\\\u5929\\\\u665a\\\\u4e0a\\\\u6709...\", \"document_id\": \"\\\"25-35\\\"\", \"questions.question\": \"[\\\"\\\\u5973\\\\u7684\\\\u6700\\\\u559c\\\\u6b22\\\\u54ea\\\\u79cd\\\\u7535...\", \"questions.answer\": \"[\\\"\\\\u559c\\\\u5267\\\\u7247\\\"]\", \"questions.choice\": \"[[\\\"\\\\u6050\\\\u6016\\\\u7247\\\", \\\"\\\\u7231\\\\u60c5\\\\u7247\\\", \\\"\\\\u5...\"}", "columns": ["documents", "document_id", "questions_question", "questions_answer", "questions_choice"], "columns_mapping": {"documents": "documents", "document_id": "document_id", "questions.question": "questions_question", "questions.answer": "questions_answer", "questions.choice": "questions_choice"}, "dataset_description": "Machine reading comprehension tasks require a machine reader to answer questions relevant to the given document. In this paper, we present the first free-form multiple-Choice Chinese machine reading Comprehension dataset (C^3), containing 13,369 documents (dialogues or more formally written mixed-genre texts) and their associated 19,577 multiple-choice free-form questions collected from Chinese-as-a-second-language examinations.\nWe present a comprehensive analysis of the prior knowledge (i.e., linguistic, domain-specific, and general world knowledge) needed for these real-world problems. We implement rule-based and popular neural methods and find that there is still a significant performance gap between the best performing model (68.5%) and human readers (96.0%), especially on problems that require prior knowledge. We further study the effects of distractor plausibility and data augmentation based on translated relevant datasets for English on model performance. We expect C^3 to present great challenges to existing systems as answering 86.8% of questions requires both knowledge within and beyond the accompanying document, and we hope that C^3 can serve as a platform to study how to leverage various kinds of prior knowledge to better understand a given written or orally oriented text.\n", "dataset_name": "c3"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:zh"], "is_gated": false}, "c4": {"dataset_name": "c4", "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.", "downloads": 68692, "configs": {"en": {"config_name": "en", "sample_row": "{\"text\": \"\\\"Beginners BBQ Class Taking Place in Missoula!\\\\nDo...\", \"timestamp\": \"\\\"2019-04-25T12:57:54Z\\\"\", \"url\": \"\\\"https://klyq.com/beginners-bbq-class-taking-place...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "dataset_name": "c4"}, "realnewslike": {"config_name": "realnewslike", "sample_row": "{\"text\": \"\\\"After the martyrdom of St. Boniface, Vergilius wa...\", \"timestamp\": \"\\\"2019-04-22T08:07:02Z\\\"\", \"url\": \"\\\"https://www.catholic.org/encyclopedia/view.php?id...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "dataset_name": "c4"}, "en.noblocklist": {"config_name": "en.noblocklist", "sample_row": "{\"text\": \"\\\"Beginners BBQ Class Taking Place in Missoula!\\\\nDo...\", \"timestamp\": \"\\\"2019-04-25T12:57:54Z\\\"\", \"url\": \"\\\"https://klyq.com/beginners-bbq-class-taking-place...\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "dataset_name": "c4"}, "en.noclean": {"config_name": "en.noclean", "sample_row": "{\"text\": \"\\\"November 24, 2016 \\\\u2013 World News, Breaking New...\", \"timestamp\": \"\\\"2019-04-24T16:35:11Z\\\"\", \"url\": \"\\\"http://sevendaynews.com/2016/11/24/\\\"\"}", "columns": ["text", "timestamp", "url"], "columns_mapping": {"text": "text", "timestamp": "timestamp", "url": "url"}, "dataset_description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "dataset_name": "c4"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original", "language:en"], "is_gated": false}, "caner": {"dataset_name": "caner", "description": "Classical Arabic Named Entity Recognition corpus as a new corpus of tagged data that can be useful for handling the issues in recognition of Arabic named entities.", "downloads": 347, "configs": {"default": {"config_name": "default", "sample_row": "{\"token\": \"\\\"\\\\u0627\\\\u0644\\\\u062c\\\\u0627\\\\u0645\\\\u0639\\\"\", \"ner_tag\": \"1\"}", "columns": ["token", "ner_tag"], "columns_mapping": {"token": "token", "ner_tag": "ner_tag"}, "dataset_description": "Classical Arabic Named Entity Recognition corpus as a new corpus of tagged data that can be useful for handling the issues in recognition of Arabic named entities.\n", "dataset_name": "caner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "casino": {"dataset_name": "casino", "description": "We provide a novel dataset (referred to as CaSiNo) of 1030 negotiation dialogues. Two participants take the role of campsite neighbors and negotiate for Food, Water, and Firewood packages, based on their individual preferences and requirements. This design keeps the task tractable, while still facilitating linguistically rich and personal conversations. This helps to overcome the limitations of prior negotiation datasets such as Deal or No Deal and Craigslist Bargain. Each dialogue consists of rich meta-data including participant demographics, personality, and their subjective evaluation of the negotiation in terms of satisfaction and opponent likeness.", "downloads": 338, "configs": {"default": {"config_name": "default", "sample_row": "{\"chat_logs\": \"[{\\\"text\\\": \\\"Hello! \\\\ud83d\\\\ude42 Let's work together...\", \"participant_info.mturk_agent_1.value2issue.Low\": \"\\\"Water\\\"\", \"participant_info.mturk_agent_1.value2issue.Medium\": \"\\\"Food\\\"\", \"participant_info.mturk_agent_1.value2issue.High\": \"\\\"Firewood\\\"\", \"participant_info.mturk_agent_1.value2reason.Low\": \"\\\"Our group has sufficient water from our complemen...\", \"participant_info.mturk_agent_1.value2reason.Medium\": \"\\\"Extra food will be needed to feed our larger than...\", \"participant_info.mturk_agent_1.value2reason.High\": \"\\\"We have a larger group than normal and therefore ...\", \"participant_info.mturk_agent_1.outcomes.points_scored\": \"19\", \"participant_info.mturk_agent_1.outcomes.satisfaction\": \"\\\"Slightly satisfied\\\"\", \"participant_info.mturk_agent_1.outcomes.opponent_likeness\": \"\\\"Slightly like\\\"\", \"participant_info.mturk_agent_1.demographics.age\": \"43\", \"participant_info.mturk_agent_1.demographics.gender\": \"\\\"male\\\"\", \"participant_info.mturk_agent_1.demographics.ethnicity\": \"\\\"white american\\\"\", \"participant_info.mturk_agent_1.demographics.education\": \"\\\"some 4 year college, bachelor's degree\\\"\", \"participant_info.mturk_agent_1.personality.svo\": \"\\\"proself\\\"\", \"participant_info.mturk_agent_1.personality.big-five.extraversion\": \"5.0\", \"participant_info.mturk_agent_1.personality.big-five.agreeableness\": \"6.0\", \"participant_info.mturk_agent_1.personality.big-five.conscientiousness\": \"6.0\", \"participant_info.mturk_agent_1.personality.big-five.emotional-stability\": \"5.0\", \"participant_info.mturk_agent_1.personality.big-five.openness-to-experiences\": \"5.5\", \"participant_info.mturk_agent_2.value2issue.Low\": \"\\\"Food\\\"\", \"participant_info.mturk_agent_2.value2issue.Medium\": \"\\\"Water\\\"\", \"participant_info.mturk_agent_2.value2issue.High\": \"\\\"Firewood\\\"\", \"participant_info.mturk_agent_2.value2reason.Low\": \"\\\"i'm on a diet, trying to lose weight.\\\"\", \"participant_info.mturk_agent_2.value2reason.Medium\": \"\\\"i'm dehydrated, and i need to drink constantly.\\\"...\", \"participant_info.mturk_agent_2.value2reason.High\": \"\\\"my dog has fleas, the fire repels them.\\\"\", \"participant_info.mturk_agent_2.outcomes.points_scored\": \"18\", \"participant_info.mturk_agent_2.outcomes.satisfaction\": \"\\\"Extremely satisfied\\\"\", \"participant_info.mturk_agent_2.outcomes.opponent_likeness\": \"\\\"Extremely like\\\"\", \"participant_info.mturk_agent_2.demographics.age\": \"22\", \"participant_info.mturk_agent_2.demographics.gender\": \"\\\"female\\\"\", \"participant_info.mturk_agent_2.demographics.ethnicity\": \"\\\"asian american\\\"\", \"participant_info.mturk_agent_2.demographics.education\": \"\\\"some 4 year college, bachelor's degree\\\"\", \"participant_info.mturk_agent_2.personality.svo\": \"\\\"proself\\\"\", \"participant_info.mturk_agent_2.personality.big-five.extraversion\": \"4.0\", \"participant_info.mturk_agent_2.personality.big-five.agreeableness\": \"6.0\", \"participant_info.mturk_agent_2.personality.big-five.conscientiousness\": \"5.5\", \"participant_info.mturk_agent_2.personality.big-five.emotional-stability\": \"3.0\", \"participant_info.mturk_agent_2.personality.big-five.openness-to-experiences\": \"7.0\", \"annotations\": \"[[\\\"Hello! \\\\ud83d\\\\ude42 Let's work together on a de...\"}", "columns": ["chat_logs", "participant_info_mturk_agent_1_value2issue_Low", "participant_info_mturk_agent_1_value2issue_Medium", "participant_info_mturk_agent_1_value2issue_High", "participant_info_mturk_agent_1_value2reason_Low", "participant_info_mturk_agent_1_value2reason_Medium", "participant_info_mturk_agent_1_value2reason_High", "participant_info_mturk_agent_1_outcomes_points_scored", "participant_info_mturk_agent_1_outcomes_satisfaction", "participant_info_mturk_agent_1_outcomes_opponent_likeness", "participant_info_mturk_agent_1_demographics_age", "participant_info_mturk_agent_1_demographics_gender", "participant_info_mturk_agent_1_demographics_ethnicity", "participant_info_mturk_agent_1_demographics_education", "participant_info_mturk_agent_1_personality_svo", "participant_info_mturk_agent_1_personality_big-five_extraversion", "participant_info_mturk_agent_1_personality_big-five_agreeableness", "participant_info_mturk_agent_1_personality_big-five_conscientiousness", "participant_info_mturk_agent_1_personality_big-five_emotional-stability", "participant_info_mturk_agent_1_personality_big-five_openness-to-experiences", "participant_info_mturk_agent_2_value2issue_Low", "participant_info_mturk_agent_2_value2issue_Medium", "participant_info_mturk_agent_2_value2issue_High", "participant_info_mturk_agent_2_value2reason_Low", "participant_info_mturk_agent_2_value2reason_Medium", "participant_info_mturk_agent_2_value2reason_High", "participant_info_mturk_agent_2_outcomes_points_scored", "participant_info_mturk_agent_2_outcomes_satisfaction", "participant_info_mturk_agent_2_outcomes_opponent_likeness", "participant_info_mturk_agent_2_demographics_age", "participant_info_mturk_agent_2_demographics_gender", "participant_info_mturk_agent_2_demographics_ethnicity", "participant_info_mturk_agent_2_demographics_education", "participant_info_mturk_agent_2_personality_svo", "participant_info_mturk_agent_2_personality_big-five_extraversion", "participant_info_mturk_agent_2_personality_big-five_agreeableness", "participant_info_mturk_agent_2_personality_big-five_conscientiousness", "participant_info_mturk_agent_2_personality_big-five_emotional-stability", "participant_info_mturk_agent_2_personality_big-five_openness-to-experiences", "annotations"], "columns_mapping": {"chat_logs": "chat_logs", "participant_info.mturk_agent_1.value2issue.Low": "participant_info_mturk_agent_1_value2issue_Low", "participant_info.mturk_agent_1.value2issue.Medium": "participant_info_mturk_agent_1_value2issue_Medium", "participant_info.mturk_agent_1.value2issue.High": "participant_info_mturk_agent_1_value2issue_High", "participant_info.mturk_agent_1.value2reason.Low": "participant_info_mturk_agent_1_value2reason_Low", "participant_info.mturk_agent_1.value2reason.Medium": "participant_info_mturk_agent_1_value2reason_Medium", "participant_info.mturk_agent_1.value2reason.High": "participant_info_mturk_agent_1_value2reason_High", "participant_info.mturk_agent_1.outcomes.points_scored": "participant_info_mturk_agent_1_outcomes_points_scored", "participant_info.mturk_agent_1.outcomes.satisfaction": "participant_info_mturk_agent_1_outcomes_satisfaction", "participant_info.mturk_agent_1.outcomes.opponent_likeness": "participant_info_mturk_agent_1_outcomes_opponent_likeness", "participant_info.mturk_agent_1.demographics.age": "participant_info_mturk_agent_1_demographics_age", "participant_info.mturk_agent_1.demographics.gender": "participant_info_mturk_agent_1_demographics_gender", "participant_info.mturk_agent_1.demographics.ethnicity": "participant_info_mturk_agent_1_demographics_ethnicity", "participant_info.mturk_agent_1.demographics.education": "participant_info_mturk_agent_1_demographics_education", "participant_info.mturk_agent_1.personality.svo": "participant_info_mturk_agent_1_personality_svo", "participant_info.mturk_agent_1.personality.big-five.extraversion": "participant_info_mturk_agent_1_personality_big-five_extraversion", "participant_info.mturk_agent_1.personality.big-five.agreeableness": "participant_info_mturk_agent_1_personality_big-five_agreeableness", "participant_info.mturk_agent_1.personality.big-five.conscientiousness": "participant_info_mturk_agent_1_personality_big-five_conscientiousness", "participant_info.mturk_agent_1.personality.big-five.emotional-stability": "participant_info_mturk_agent_1_personality_big-five_emotional-stability", "participant_info.mturk_agent_1.personality.big-five.openness-to-experiences": "participant_info_mturk_agent_1_personality_big-five_openness-to-experiences", "participant_info.mturk_agent_2.value2issue.Low": "participant_info_mturk_agent_2_value2issue_Low", "participant_info.mturk_agent_2.value2issue.Medium": "participant_info_mturk_agent_2_value2issue_Medium", "participant_info.mturk_agent_2.value2issue.High": "participant_info_mturk_agent_2_value2issue_High", "participant_info.mturk_agent_2.value2reason.Low": "participant_info_mturk_agent_2_value2reason_Low", "participant_info.mturk_agent_2.value2reason.Medium": "participant_info_mturk_agent_2_value2reason_Medium", "participant_info.mturk_agent_2.value2reason.High": "participant_info_mturk_agent_2_value2reason_High", "participant_info.mturk_agent_2.outcomes.points_scored": "participant_info_mturk_agent_2_outcomes_points_scored", "participant_info.mturk_agent_2.outcomes.satisfaction": "participant_info_mturk_agent_2_outcomes_satisfaction", "participant_info.mturk_agent_2.outcomes.opponent_likeness": "participant_info_mturk_agent_2_outcomes_opponent_likeness", "participant_info.mturk_agent_2.demographics.age": "participant_info_mturk_agent_2_demographics_age", "participant_info.mturk_agent_2.demographics.gender": "participant_info_mturk_agent_2_demographics_gender", "participant_info.mturk_agent_2.demographics.ethnicity": "participant_info_mturk_agent_2_demographics_ethnicity", "participant_info.mturk_agent_2.demographics.education": "participant_info_mturk_agent_2_demographics_education", "participant_info.mturk_agent_2.personality.svo": "participant_info_mturk_agent_2_personality_svo", "participant_info.mturk_agent_2.personality.big-five.extraversion": "participant_info_mturk_agent_2_personality_big-five_extraversion", "participant_info.mturk_agent_2.personality.big-five.agreeableness": "participant_info_mturk_agent_2_personality_big-five_agreeableness", "participant_info.mturk_agent_2.personality.big-five.conscientiousness": "participant_info_mturk_agent_2_personality_big-five_conscientiousness", "participant_info.mturk_agent_2.personality.big-five.emotional-stability": "participant_info_mturk_agent_2_personality_big-five_emotional-stability", "participant_info.mturk_agent_2.personality.big-five.openness-to-experiences": "participant_info_mturk_agent_2_personality_big-five_openness-to-experiences", "annotations": "annotations"}, "dataset_description": "We provide a novel dataset (referred to as CaSiNo) of 1030 negotiation dialogues. Two participants take the role of campsite neighbors and negotiate for Food, Water, and Firewood packages, based on their individual preferences and requirements. This design keeps the task tractable, while still facilitating linguistically rich and personal conversations. This helps to overcome the limitations of prior negotiation datasets such as Deal or No Deal and Craigslist Bargain. Each dialogue consists of rich meta-data including participant demographics, personality, and their subjective evaluation of the negotiation in terms of satisfaction and opponent likeness.\n", "dataset_name": "casino"}}, "tags": ["task_categories:conversational", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "catalonia_independence": {"dataset_name": "catalonia_independence", "description": "This dataset contains two corpora in Spanish and Catalan that consist of annotated Twitter messages for automatic stance detection. The data was collected over 12 days during February and March of 2019 from tweets posted in Barcelona, and during September of 2018 from tweets posted in the town of Terrassa, Catalonia.\n\nEach corpus is annotated with three classes: AGAINST, FAVOR and NEUTRAL, which express the stance towards the target - independence of Catalonia.", "downloads": 510, "configs": {"catalan": {"config_name": "catalan", "sample_row": "{\"id_str\": \"\\\"11028517837209518e+18\\\"\", \"TWEET\": \"\\\"En @fgarrobo ha fet m\\\\u00e9s per l\\\\u2019independe...\", \"LABEL\": \"0\"}", "columns": ["id_str", "TWEET", "LABEL"], "columns_mapping": {"id_str": "id_str", "TWEET": "TWEET", "LABEL": "LABEL"}, "dataset_description": "This dataset contains two corpora in Spanish and Catalan that consist of annotated Twitter messages for automatic stance detection. The data was collected over 12 days during February and March of 2019 from tweets posted in Barcelona, and during September of 2018 from tweets posted in the town of Terrassa, Catalonia.\n\nEach corpus is annotated with three classes: AGAINST, FAVOR and NEUTRAL, which express the stance towards the target - independence of Catalonia.\n", "dataset_name": "catalonia_independence"}, "spanish": {"config_name": "spanish", "sample_row": "{\"id_str\": \"\\\"1099284472267182080\\\"\", \"TWEET\": \"\\\"RT @EFEnoticias: Arrimadas se presenta a las gene...\", \"LABEL\": \"0\"}", "columns": ["id_str", "TWEET", "LABEL"], "columns_mapping": {"id_str": "id_str", "TWEET": "TWEET", "LABEL": "LABEL"}, "dataset_description": "This dataset contains two corpora in Spanish and Catalan that consist of annotated Twitter messages for automatic stance detection. The data was collected over 12 days during February and March of 2019 from tweets posted in Barcelona, and during September of 2018 from tweets posted in the town of Terrassa, Catalonia.\n\nEach corpus is annotated with three classes: AGAINST, FAVOR and NEUTRAL, which express the stance towards the target - independence of Catalonia.\n", "dataset_name": "catalonia_independence"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ca", "language:es", "stance-detection"], "is_gated": false}, "cbt": {"dataset_name": "cbt", "description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.", "downloads": 1667, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"title\": \"\\\"Andrew_Lang___Prince_Prigio.txt.out\\\"\", \"content\": \"\\\"CHAPTER I. -LCB- Chapter heading picture : p1.jpg...\"}", "columns": ["title", "content"], "columns_mapping": {"title": "title", "content": "content"}, "dataset_description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.\n", "dataset_name": "cbt"}, "V": {"config_name": "V", "sample_row": "{\"sentences\": \"[\\\"This vexed the king even more than the queen , w...\", \"question\": \"\\\"`` They are very kind old ladies in their way , '...\", \"answer\": \"\\\"said\\\"\", \"options\": \"[\\\"christening\\\", \\\"existed\\\", \\\"hear\\\", \\\"knows\\\", \\\"read\\\"...\"}", "columns": ["sentences", "question", "answer", "options"], "columns_mapping": {"sentences": "sentences", "question": "question", "answer": "answer", "options": "options"}, "dataset_description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.\n", "dataset_name": "cbt"}, "P": {"config_name": "P", "sample_row": "{\"sentences\": \"[\\\"CHAPTER I. -LCB- Chapter heading picture : p1.jp...\", \"question\": \"\\\"`` You have not forgotten any XXXXX our aunts ? '...\", \"answer\": \"\\\"of\\\"\", \"options\": \"[\\\"With\\\", \\\"before\\\", \\\"in\\\", \\\"of\\\", \\\"on\\\", \\\"than\\\", \\\"that...\"}", "columns": ["sentences", "question", "answer", "options"], "columns_mapping": {"sentences": "sentences", "question": "question", "answer": "answer", "options": "options"}, "dataset_description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.\n", "dataset_name": "cbt"}, "NE": {"config_name": "NE", "sample_row": "{\"sentences\": \"[\\\"Some were abroad ; several were ill ; a few were...\", \"question\": \"\\\"I think I 've told you that his name was XXXXX --...\", \"answer\": \"\\\"Prigio\\\"\", \"options\": \"[\\\"CHAPTER\\\", \\\"Flitter\\\", \\\"Prigio\\\", \\\"Saracens\\\", \\\"lumb...\"}", "columns": ["sentences", "question", "answer", "options"], "columns_mapping": {"sentences": "sentences", "question": "question", "answer": "answer", "options": "options"}, "dataset_description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.\n", "dataset_name": "cbt"}, "CN": {"config_name": "CN", "sample_row": "{\"sentences\": \"[\\\"With almost everything else to make them happy ,...\", \"question\": \"\\\"replied the XXXXX ; for the king 's aunts were ol...\", \"answer\": \"\\\"queen\\\"\", \"options\": \"[\\\"ancestors\\\", \\\"baby\\\", \\\"boy\\\", \\\"everyone\\\", \\\"fairies\\\"...\"}", "columns": ["sentences", "question", "answer", "options"], "columns_mapping": {"sentences": "sentences", "question": "question", "answer": "answer", "options": "options"}, "dataset_description": "The Children\u2019s Book Test (CBT) is designed to measure directly\nhow well language models can exploit wider linguistic context.\nThe CBT is built from books that are freely available.\n", "dataset_name": "cbt"}}, "tags": ["task_categories:other", "task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "cc100": {"dataset_name": "cc100", "description": "This corpus is an attempt to recreate the dataset used for training XLM-R. This corpus comprises of monolingual data for 100+ languages and also includes data for romanized languages (indicated by *_rom). This was constructed using the urls and paragraph indices provided by the CC-Net repository by processing January-December 2018 Commoncrawl snapshots. Each file comprises of documents separated by double-newlines and paragraphs within the same document separated by a newline. The data is generated using the open source CC-Net repository. No claims of intellectual property are made on the work of preparation of the corpus.", "downloads": 9293, "configs": {"am": {"config_name": "am", "sample_row": "{\"id\": \"\\\"0\\\"\", \"text\": \"\\\"\\\\u1270\\\\u1208\\\\u12cb\\\\u12cb\\\\u132d \\\\u12e8\\\\u130d\\\\u12f5...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "This corpus is an attempt to recreate the dataset used for training XLM-R. This corpus comprises of monolingual data for 100+ languages and also includes data for romanized languages (indicated by *_rom). This was constructed using the urls and paragraph indices provided by the CC-Net repository by processing January-December 2018 Commoncrawl snapshots. Each file comprises of documents separated by double-newlines and paragraphs within the same document separated by a newline. The data is generated using the open source CC-Net repository. No claims of intellectual property are made on the work of preparation of the corpus.\n", "dataset_name": "cc100"}, "sr": {"config_name": "sr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"text\": \"\\\"\\\\u2626 \\\\u041e\\\\u0431\\\\u043d\\\\u0430\\\\u0432\\\\u0459\\\\u0430...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "This corpus is an attempt to recreate the dataset used for training XLM-R. This corpus comprises of monolingual data for 100+ languages and also includes data for romanized languages (indicated by *_rom). This was constructed using the urls and paragraph indices provided by the CC-Net repository by processing January-December 2018 Commoncrawl snapshots. Each file comprises of documents separated by double-newlines and paragraphs within the same document separated by a newline. The data is generated using the open source CC-Net repository. No claims of intellectual property are made on the work of preparation of the corpus.\n", "dataset_name": "cc100"}, "ka": {"config_name": "ka", "sample_row": "{\"id\": \"\\\"0\\\"\", \"text\": \"\\\"\\\\u10d4\\\\u10e0\\\\u10dd\\\\u10d5\\\\u10dc\\\\u10e3\\\\u10da\\\\u10d8 ...\"}", "columns": ["id", "text"], "columns_mapping": {"id": "id", "text": "text"}, "dataset_description": "This corpus is an attempt to recreate the dataset used for training XLM-R. This corpus comprises of monolingual data for 100+ languages and also includes data for romanized languages (indicated by *_rom). This was constructed using the urls and paragraph indices provided by the CC-Net repository by processing January-December 2018 Commoncrawl snapshots. Each file comprises of documents separated by double-newlines and paragraphs within the same document separated by a newline. The data is generated using the open source CC-Net repository. No claims of intellectual property are made on the work of preparation of the corpus.\n", "dataset_name": "cc100"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original", "language:af", "language:am", "language:ar", "language:as", "language:az", "language:be", "language:bg", "language:bn", "language:br", "language:bs", "language:ca", "language:cs", "language:cy", "language:da", "language:de", "language:el", "language:en", "language:eo", "language:es", "language:et", "language:eu", "language:fa", "language:ff", "language:fi", "language:fr", "language:fy", "language:ga", "language:gd", "language:gl", "language:gn", "language:gu", "language:ha", "language:he", "language:hi", "language:hr", "language:ht", "language:hu", "language:hy", "language:id", "language:ig", "language:is", "language:it", "language:ja", "language:jv", "language:ka", "language:kk", "language:km", "language:kn", "language:ko", "language:ku", "language:ky", "language:la", "language:lg", "language:li", "language:ln", "language:lo", "language:lt", "language:lv", "language:mg", "language:mk", "language:ml", "language:mn", "language:mr", "language:ms", "language:my", "language:ne", "language:nl", "language:no", "language:ns", "language:om", "language:or", "language:pa", "language:pl", "language:ps", "language:pt", "language:qu", "language:rm", "language:ro", "language:ru", "language:sa", "language:sc", "language:sd", "language:si", "language:sk", "language:sl", "language:so", "language:sq", "language:sr", "language:ss", "language:su", "language:sv", "language:sw", "language:ta", "language:te", "language:th", "language:tl", "language:tn", "language:tr", "language:ug", "language:uk", "language:ur", "language:uz", "language:vi", "language:wo", "language:xh", "language:yi", "language:yo", "language:zh", "language:zu"], "is_gated": false}, "cc_news": {"dataset_name": "cc_news", "description": "CC-News containing news articles from news sites all over the world The data is available on AWS S3 in the Common Crawl bucket at /crawl-data/CC-NEWS/. This version of the dataset has 708241 articles. It represents a small portion of English language subset of the CC-News dataset created using news-please(Hamborg et al.,2017) to collect and extract English language portion of CC-News.", "downloads": 2681, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"title\": \"\\\"Daughter Duo is Dancing in The Same Company\\\"\", \"text\": \"\\\"There's a surprising twist to Regina Willoughby's...\", \"domain\": \"\\\"www.pointemagazine.com\\\"\", \"date\": \"\\\"2017-12-11 20:19:05\\\"\", \"description\": \"\\\"There's a surprising twist to Regina Willoughby's...\", \"url\": \"\\\"http://www.pointemagazine.com/mother-daughter-duo...\", \"image_url\": \"\\\"https://pointe-img.rbl.ms/simage/https%3A%2F%2Fas...\"}", "columns": ["title", "text", "domain", "date", "description", "url", "image_url"], "columns_mapping": {"title": "title", "text": "text", "domain": "domain", "date": "date", "description": "description", "url": "url", "image_url": "image_url"}, "dataset_description": "CC-News containing news articles from news sites all over the world The data is available on AWS S3 in the Common Crawl bucket at /crawl-data/CC-NEWS/. This version of the dataset has 708241 articles. It represents a small portion of English language subset of the CC-News dataset created using news-please(Hamborg et al.,2017) to collect and extract English language portion of CC-News.\n", "dataset_name": "cc_news"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "ccaligned_multilingual": {"dataset_name": "ccaligned_multilingual", "description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).", "downloads": 1220, "configs": {"documents-zz_TR": {"config_name": "documents-zz_TR", "sample_row": "{\"Domain\": \"\\\"wext.it\\\"\", \"Source_URL\": \"\\\"http://wext.it/en/\\\"\", \"Target_URL\": \"\\\"https://wext.it/\\\"\", \"translation.en_XX\": \"\\\"wext.it|wext.it|Software WEXT|Blockchain, artific...\", \"translation.zz_TR\": \"\\\"Wext|\\\"\"}", "columns": ["Domain", "Source_URL", "Target_URL", "translation_en_XX", "translation_zz_TR"], "columns_mapping": {"Domain": "Domain", "Source_URL": "Source_URL", "Target_URL": "Target_URL", "translation.en_XX": "translation_en_XX", "translation.zz_TR": "translation_zz_TR"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}, "sentences-zz_TR": {"config_name": "sentences-zz_TR", "sample_row": "{\"translation.en_XX\": \"\\\"YADORU KYOTO Kagami no Yado _Official Site_YADORU...\", \"translation.zz_TR\": \"\\\"Washi No YadoKagami No YadoKanade No YadoMizunoe ...\", \"LASER_similarity\": \"1.1320143\"}", "columns": ["translation_en_XX", "translation_zz_TR", "LASER_similarity"], "columns_mapping": {"translation.en_XX": "translation_en_XX", "translation.zz_TR": "translation_zz_TR", "LASER_similarity": "LASER_similarity"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}, "documents-tz_MA": {"config_name": "documents-tz_MA", "sample_row": "{\"Domain\": \"\\\"kasahorow.org\\\"\", \"Source_URL\": \"\\\"https://men.kasahorow.org/app/b\\\"\", \"Target_URL\": \"\\\"http://tzm.kasahorow.org/app/b\\\"\", \"translation.en_XX\": \"\\\"Read _ Mende kasahorow|Menu|Alikamisa. Saa 19, 20...\", \"translation.tz_MA\": \"\\\"Read _ Tamazight kasahorow|Menu|\\\\u2d30\\\\u2d59\\\\u2d3...\"}", "columns": ["Domain", "Source_URL", "Target_URL", "translation_en_XX", "translation_tz_MA"], "columns_mapping": {"Domain": "Domain", "Source_URL": "Source_URL", "Target_URL": "Target_URL", "translation.en_XX": "translation_en_XX", "translation.tz_MA": "translation_tz_MA"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}, "sentences-tz_MA": {"config_name": "sentences-tz_MA", "sample_row": "{\"translation.en_XX\": \"\\\"Tuesday 18 June 2019 _ 14:41\\\"\", \"translation.tz_MA\": \"\\\"\\\\u2d30\\\\u2d3d\\\\u2d61\\\\u2d30\\\\u2d59 14 \\\\u2d4f\\\\u2d53\\\\u2...\", \"LASER_similarity\": \"1.2042842\"}", "columns": ["translation_en_XX", "translation_tz_MA", "LASER_similarity"], "columns_mapping": {"translation.en_XX": "translation_en_XX", "translation.tz_MA": "translation_tz_MA", "LASER_similarity": "LASER_similarity"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}, "documents-ak_GH": {"config_name": "documents-ak_GH", "sample_row": "{\"Domain\": \"\\\"islamhouse.com\\\"\", \"Source_URL\": \"\\\"https://islamhouse.com/en/audios/373088/\\\"\", \"Target_URL\": \"\\\"https://islamhouse.com/ak/audios/373088/\\\"\", \"translation.en_XX\": \"\\\"SUMMARY in the jurisprudence of Umrah - Arabic - ...\", \"translation.ak_GH\": \"\\\"Ntwatiaa / w\\\\u0254ab\\\\u0254 no t\\\\u0254fa w\\\\u0254 m...\"}", "columns": ["Domain", "Source_URL", "Target_URL", "translation_en_XX", "translation_ak_GH"], "columns_mapping": {"Domain": "Domain", "Source_URL": "Source_URL", "Target_URL": "Target_URL", "translation.en_XX": "translation_en_XX", "translation.ak_GH": "translation_ak_GH"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}, "sentences-ak_GH": {"config_name": "sentences-ak_GH", "sample_row": "{\"translation.ak_GH\": \"\\\"Salah (nyamefere) ye Mmerebeia\\\"\", \"translation.en_XX\": \"\\\"What he dislikes when fasting (10)\\\"\", \"LASER_similarity\": \"1.4549942\"}", "columns": ["translation_ak_GH", "translation_en_XX", "LASER_similarity"], "columns_mapping": {"translation.ak_GH": "translation_ak_GH", "translation.en_XX": "translation_en_XX", "LASER_similarity": "LASER_similarity"}, "dataset_description": "CCAligned consists of parallel or comparable web-document pairs in 137 languages aligned with English. These web-document pairs were constructed by performing language identification on raw web-documents, and ensuring corresponding language codes were corresponding in the URLs of web documents. This pattern matching approach yielded more than 100 million aligned documents paired with English. Recognizing that each English document was often aligned to mulitple documents in different target language, we can join on English documents to obtain aligned documents that directly pair two non-English documents (e.g., Arabic-French).\n", "dataset_name": "ccaligned_multilingual"}}, "tags": ["task_categories:other", "annotations_creators:no-annotation", "multilinguality:translation", "source_datasets:original", "language:af", "language:ak", "language:am", "language:ar", "language:as", "language:ay", "language:az", "language:be", "language:bg", "language:bm", "language:bn", "language:br", "language:bs", "language:ca", "language:ceb", "language:ckb", "language:cs", "language:cy", "language:de", "language:dv", "language:el", "language:eo", "language:es", "language:fa", "language:ff", "language:fi", "language:fo", "language:fr", "language:fy", "language:ga", "language:gl", "language:gn", "language:gu", "language:he", "language:hi", "language:hr", "language:hu", "language:id", "language:ig", "language:is", "language:it", "language:iu", "language:ja", "language:ka", "language:kac", "language:kg", "language:kk", "language:km", "language:kn", "language:ko", "language:ku", "language:ky", "language:la", "language:lg", "language:li", "language:ln", "language:lo", "language:lt", "language:lv", "language:mg", "language:mi", "language:mk", "language:ml", "language:mn", "language:mr", "language:ms", "language:mt", "language:my", "language:ne", "language:nl", "language:no", "language:nso", "language:ny", "language:om", "language:or", "language:pa", "language:pl", "language:ps", "language:pt", "language:rm", "language:ro", "language:ru", "language:rw", "language:sc", "language:sd", "language:se", "language:shn", "language:si", "language:sk", "language:sl", "language:sn", "language:so", "language:sq", "language:sr", "language:ss", "language:st", "language:su", "language:sv", "language:sw", "language:syc", "language:szl", "language:ta", "language:te", "language:tg", "language:th", "language:ti", "language:tl", "language:tn", "language:tr", "language:ts", "language:tt", "language:ug", "language:uk", "language:ur", "language:uz", "language:ve", "language:vi", "language:war", "language:wo", "language:xh", "language:yi", "language:yo", "language:zgh", "language:zh", "language:zu", "language:zza"], "is_gated": false}, "cdsc": {"dataset_name": "cdsc", "description": "Polish CDSCorpus consists of 10K Polish sentence pairs which are human-annotated for semantic relatedness and entailment. The dataset may be used for the evaluation of compositional distributional semantics models of Polish. The dataset was presented at ACL 2017. Please refer to the Wr\u00f3blewska and Krasnowska-Kiera\u015b (2017) for a detailed description of the resource.", "downloads": 507, "configs": {"cdsc-e": {"config_name": "cdsc-e", "sample_row": "{\"pair_ID\": \"1\", \"sentence_A\": \"\\\"Ch\\\\u0142opiec w czerwonych trampkach skacze wysok...\", \"sentence_B\": \"\\\"Ch\\\\u0142opiec w bluzce w paski podskakuje wysoko ...\", \"entailment_judgment\": \"0\"}", "columns": ["pair_ID", "sentence_A", "sentence_B", "entailment_judgment"], "columns_mapping": {"pair_ID": "pair_ID", "sentence_A": "sentence_A", "sentence_B": "sentence_B", "entailment_judgment": "entailment_judgment"}, "dataset_description": "Polish CDSCorpus consists of 10K Polish sentence pairs which are human-annotated for semantic relatedness and entailment. The dataset may be used for the evaluation of compositional distributional semantics models of Polish. The dataset was presented at ACL 2017. Please refer to the Wr\u00f3blewska and Krasnowska-Kiera\u015b (2017) for a detailed description of the resource.\n", "dataset_name": "cdsc"}, "cdsc-r": {"config_name": "cdsc-r", "sample_row": "{\"pair_ID\": \"1\", \"sentence_A\": \"\\\"Ch\\\\u0142opiec w czerwonych trampkach skacze wysok...\", \"sentence_B\": \"\\\"Ch\\\\u0142opiec w bluzce w paski podskakuje wysoko ...\", \"relatedness_score\": \"3.0\"}", "columns": ["pair_ID", "sentence_A", "sentence_B", "relatedness_score"], "columns_mapping": {"pair_ID": "pair_ID", "sentence_A": "sentence_A", "sentence_B": "sentence_B", "relatedness_score": "relatedness_score"}, "dataset_description": "Polish CDSCorpus consists of 10K Polish sentence pairs which are human-annotated for semantic relatedness and entailment. The dataset may be used for the evaluation of compositional distributional semantics models of Polish. The dataset was presented at ACL 2017. Please refer to the Wr\u00f3blewska and Krasnowska-Kiera\u015b (2017) for a detailed description of the resource.\n", "dataset_name": "cdsc"}}, "tags": ["task_categories:other", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl", "sentences entailment and relatedness"], "is_gated": false}, "cdt": {"dataset_name": "cdt", "description": "The Cyberbullying Detection task was part of 2019 edition of PolEval competition. The goal is to predict if a given Twitter message contains a cyberbullying (harmful) content.", "downloads": 356, "configs": {"default": {"config_name": "default", "sample_row": "{\"sentence\": \"\\\"Dla mnie faworytem do tytu\\\\u0142u b\\\\u0119dzie Cra...\", \"target\": \"0\"}", "columns": ["sentence", "target"], "columns_mapping": {"sentence": "sentence", "target": "target"}, "dataset_description": "The Cyberbullying Detection task was part of 2019 edition of PolEval competition. The goal is to predict if a given Twitter message contains a cyberbullying (harmful) content.\n", "dataset_name": "cdt"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl"], "is_gated": false}, "cedr": {"dataset_name": "cedr", "description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.", "downloads": 922, "configs": {"main": {"config_name": "main", "sample_row": "{\"text\": \"\\\"\\\\u0421\\\\u0443\\\\u0440\\\\u043e\\\\u0432\\\\u044b\\\\u0439 \\\\u0433...\", \"labels\": \"[]\", \"source\": \"\\\"lj\\\"\"}", "columns": ["text", "labels", "source"], "columns_mapping": {"text": "text", "labels": "labels", "source": "source"}, "dataset_description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n", "dataset_name": "cedr"}, "enriched": {"config_name": "enriched", "sample_row": "{\"text\": \"\\\"\\\\u0421\\\\u0443\\\\u0440\\\\u043e\\\\u0432\\\\u044b\\\\u0439 \\\\u0433...\", \"labels\": \"[]\", \"source\": \"\\\"lj\\\"\", \"sentences\": \"[[{\\\"forma\\\": \\\"\\\\u0421\\\\u0443\\\\u0440\\\\u043e\\\\u0432\\\\u044b\\\\...\"}", "columns": ["text", "labels", "source", "sentences"], "columns_mapping": {"text": "text", "labels": "labels", "source": "source", "sentences": "sentences"}, "dataset_description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n", "dataset_name": "cedr"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "task_ids:multi-label-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ru", "emotion-classification"], "is_gated": false}, "circa": {"dataset_name": "circa", "description": "The Circa (meaning \u2018approximately\u2019) dataset aims to help machine learning systems\nto solve the problem of interpreting indirect answers to polar questions.\n\nThe dataset contains pairs of yes/no questions and indirect answers, together with\nannotations for the interpretation of the answer. The data is collected in 10\ndifferent social conversational situations (eg. food preferences of a friend).\n\nNOTE: There might be missing labels in the dataset and we have replaced them with -1.\nThe original dataset contains no train/dev/test splits.", "downloads": 1184, "configs": {"default": {"config_name": "default", "sample_row": "{\"context\": \"\\\"Y has just travelled from a different city to mee...\", \"question-X\": \"\\\"Are you employed?\\\"\", \"canquestion-X\": \"\\\"I am employed .\\\"\", \"answer-Y\": \"\\\"I'm a veterinary technician.\\\"\", \"judgements\": \"\\\"Yes#Yes#Yes#Yes#Yes\\\"\", \"goldstandard1\": \"0\", \"goldstandard2\": \"0\"}", "columns": ["context", "question-X", "canquestion-X", "answer-Y", "judgements", "goldstandard1", "goldstandard2"], "columns_mapping": {"context": "context", "question-X": "question-X", "canquestion-X": "canquestion-X", "answer-Y": "answer-Y", "judgements": "judgements", "goldstandard1": "goldstandard1", "goldstandard2": "goldstandard2"}, "dataset_description": "The Circa (meaning \u2018approximately\u2019) dataset aims to help machine learning systems\nto solve the problem of interpreting indirect answers to polar questions.\n\nThe dataset contains pairs of yes/no questions and indirect answers, together with\nannotations for the interpretation of the answer. The data is collected in 10\ndifferent social conversational situations (eg. food preferences of a friend).\n\nNOTE: There might be missing labels in the dataset and we have replaced them with -1.\nThe original dataset contains no train/dev/test splits.\n", "dataset_name": "circa"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "question-answer-pair-classification"], "is_gated": false}, "civil_comments": {"dataset_name": "civil_comments", "description": "The comments in this dataset come from an archive of the Civil Comments\nplatform, a commenting plugin for independent news sites. These public comments\nwere created from 2015 - 2017 and appeared on approximately 50 English-language\nnews sites across the world. When Civil Comments shut down in 2017, they chose\nto make the public comments available in a lasting open archive to enable future\nresearch. The original data, published on figshare, includes the public comment\ntext, some associated metadata such as article IDs, timestamps and\ncommenter-generated \"civility\" labels, but does not include user ids. Jigsaw\nextended this dataset by adding additional labels for toxicity and identity\nmentions. This data set is an exact replica of the data released for the\nJigsaw Unintended Bias in Toxicity Classification Kaggle challenge. This\ndataset is released under CC0, as is the underlying comment text.", "downloads": 1244, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"This is so cool. It's like, 'would you want your ...\", \"toxicity\": \"0.0\", \"severe_toxicity\": \"0.0\", \"obscene\": \"0.0\", \"threat\": \"0.0\", \"insult\": \"0.0\", \"identity_attack\": \"0.0\", \"sexual_explicit\": \"0.0\"}", "columns": ["text", "toxicity", "severe_toxicity", "obscene", "threat", "insult", "identity_attack", "sexual_explicit"], "columns_mapping": {"text": "text", "toxicity": "toxicity", "severe_toxicity": "severe_toxicity", "obscene": "obscene", "threat": "threat", "insult": "insult", "identity_attack": "identity_attack", "sexual_explicit": "sexual_explicit"}, "dataset_description": "\nThe comments in this dataset come from an archive of the Civil Comments\nplatform, a commenting plugin for independent news sites. These public comments\nwere created from 2015 - 2017 and appeared on approximately 50 English-language\nnews sites across the world. When Civil Comments shut down in 2017, they chose\nto make the public comments available in a lasting open archive to enable future\nresearch. The original data, published on figshare, includes the public comment\ntext, some associated metadata such as article IDs, timestamps and\ncommenter-generated \"civility\" labels, but does not include user ids. Jigsaw\nextended this dataset by adding additional labels for toxicity and identity\nmentions. This data set is an exact replica of the data released for the\nJigsaw Unintended Bias in Toxicity Classification Kaggle challenge. This\ndataset is released under CC0, as is the underlying comment text.\n", "dataset_name": "civil_comments"}}, "tags": ["language:en"], "is_gated": false}, "clickbait_news_bg": {"dataset_name": "clickbait_news_bg", "description": "Dataset with clickbait and fake news in Bulgarian. Introduced for the Hack the Fake News 2017.", "downloads": 345, "configs": {"default": {"config_name": "default", "sample_row": "{\"fake_news_score\": \"0\", \"click_bait_score\": \"0\", \"content_title\": \"\\\"\\\\u041a\\\\u0430\\\\u043c\\\\u0438\\\\u043b \\\\u0425\\\\u0430\\\\u0431...\", \"content_url\": \"\\\"http://a-specto.bg/kamil-habib-daesh-i-nusra-sa-n...\", \"content_published_time\": \"\\\"2017-05-17 18:35:00\\\"\", \"content\": \"\\\"\\\\u0418\\\\u043d\\\\u0442\\\\u0435\\\\u0440\\\\u0432\\\\u044e \\\\u043d...\"}", "columns": ["fake_news_score", "click_bait_score", "content_title", "content_url", "content_published_time", "content"], "columns_mapping": {"fake_news_score": "fake_news_score", "click_bait_score": "click_bait_score", "content_title": "content_title", "content_url": "content_url", "content_published_time": "content_published_time", "content": "content"}, "dataset_description": "Dataset with clickbait and fake news in Bulgarian. Introduced for the Hack the Fake News 2017.\n", "dataset_name": "clickbait_news_bg"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:bg"], "is_gated": false}, "clinc_oos": {"dataset_name": "clinc_oos", "description": " This dataset is for evaluating the performance of intent classification systems in the\n presence of \"out-of-scope\" queries. By \"out-of-scope\", we mean queries that do not fall\n into any of the system-supported intent classes. Most datasets include only data that is\n \"in-scope\". Our dataset includes both in-scope and out-of-scope data. You might also know\n the term \"out-of-scope\" by other terms, including \"out-of-domain\" or \"out-of-distribution\".", "downloads": 2021, "configs": {"small": {"config_name": "small", "sample_row": "{\"text\": \"\\\"can you walk me through setting up direct deposit...\", \"intent\": \"108\"}", "columns": ["text", "intent"], "columns_mapping": {"text": "text", "intent": "intent"}, "dataset_description": " This dataset is for evaluating the performance of intent classification systems in the\n presence of \"out-of-scope\" queries. By \"out-of-scope\", we mean queries that do not fall\n into any of the system-supported intent classes. Most datasets include only data that is\n \"in-scope\". Our dataset includes both in-scope and out-of-scope data. You might also know\n the term \"out-of-scope\" by other terms, including \"out-of-domain\" or \"out-of-distribution\".\n\nSmall, in which there are only 50 training queries per each in-scope intent\n", "dataset_name": "clinc_oos"}, "imbalanced": {"config_name": "imbalanced", "sample_row": "{\"text\": \"\\\"what are the steps for setting up direct deposit ...\", \"intent\": \"108\"}", "columns": ["text", "intent"], "columns_mapping": {"text": "text", "intent": "intent"}, "dataset_description": " This dataset is for evaluating the performance of intent classification systems in the\n presence of \"out-of-scope\" queries. By \"out-of-scope\", we mean queries that do not fall\n into any of the system-supported intent classes. Most datasets include only data that is\n \"in-scope\". Our dataset includes both in-scope and out-of-scope data. You might also know\n the term \"out-of-scope\" by other terms, including \"out-of-domain\" or \"out-of-distribution\".\n\nImbalanced, in which intents have either 25, 50, 75, or 100 training queries.\n", "dataset_name": "clinc_oos"}, "plus": {"config_name": "plus", "sample_row": "{\"text\": \"\\\"what expression would i use to say i love you if ...\", \"intent\": \"61\"}", "columns": ["text", "intent"], "columns_mapping": {"text": "text", "intent": "intent"}, "dataset_description": " This dataset is for evaluating the performance of intent classification systems in the\n presence of \"out-of-scope\" queries. By \"out-of-scope\", we mean queries that do not fall\n into any of the system-supported intent classes. Most datasets include only data that is\n \"in-scope\". Our dataset includes both in-scope and out-of-scope data. You might also know\n the term \"out-of-scope\" by other terms, including \"out-of-domain\" or \"out-of-distribution\".\n\nOOS+, in which there are 250 out-of-scope training examples, rather than 100.\n", "dataset_name": "clinc_oos"}}, "tags": ["task_categories:text-classification", "task_ids:intent-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "cmu_hinglish_dog": {"dataset_name": "cmu_hinglish_dog", "description": "This is a collection of text conversations in Hinglish (code mixing between Hindi-English) and their corresponding English only versions. Can be used for Translating between the two.", "downloads": 506, "configs": {"default": {"config_name": "default", "sample_row": "{\"date\": \"\\\"2018-03-21T23:01:52.359Z\\\"\", \"docIdx\": \"0\", \"translation.hi_en\": \"\\\"HELLO, KYA AAP KO MOVIES PASAND HEIN?\\\"\", \"translation.en\": \"\\\"Hello. Do you like movies?\\\"\", \"uid\": \"\\\"user2\\\"\", \"utcTimestamp\": \"\\\"2018-03-21T23:02:56.623Z\\\"\", \"rating\": \"2\", \"status\": \"1\", \"uid1LogInTime\": \"\\\"2018-03-21T23:01:52.359Z\\\"\", \"uid1LogOutTime\": \"\\\"2018-03-21T23:23:16.414Z\\\"\", \"uid1response.response\": \"[1, 2, 3, 5]\", \"uid1response.type\": \"\\\"finish\\\"\", \"uid2response.response\": \"[2, 3, 4]\", \"uid2response.type\": \"\\\"finish\\\"\", \"user2_id\": \"\\\"USR3699\\\"\", \"whoSawDoc\": \"[\\\"user1\\\", \\\"user2\\\"]\", \"wikiDocumentIdx\": \"24\"}", "columns": ["date", "docIdx", "translation_hi_en", "translation_en", "uid", "utcTimestamp", "rating", "status", "uid1LogInTime", "uid1LogOutTime", "uid1response_response", "uid1response_type", "uid2response_response", "uid2response_type", "user2_id", "whoSawDoc", "wikiDocumentIdx"], "columns_mapping": {"date": "date", "docIdx": "docIdx", "translation.hi_en": "translation_hi_en", "translation.en": "translation_en", "uid": "uid", "utcTimestamp": "utcTimestamp", "rating": "rating", "status": "status", "uid1LogInTime": "uid1LogInTime", "uid1LogOutTime": "uid1LogOutTime", "uid1response.response": "uid1response_response", "uid1response.type": "uid1response_type", "uid2response.response": "uid2response_response", "uid2response.type": "uid2response_type", "user2_id": "user2_id", "whoSawDoc": "whoSawDoc", "wikiDocumentIdx": "wikiDocumentIdx"}, "dataset_description": "This is a collection of text conversations in Hinglish (code mixing between Hindi-English) and their corresponding English only versions. Can be used for Translating between the two.\n", "dataset_name": "cmu_hinglish_dog"}}, "tags": ["task_categories:translation", "annotations_creators:machine-generated", "multilinguality:multilingual", "multilinguality:translation", "source_datasets:original", "language:en", "language:hi"], "is_gated": false}, "cnn_dailymail": {"dataset_name": "cnn_dailymail", "description": "CNN/DailyMail non-anonymized summarization dataset.\n\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary", "downloads": 86023, "configs": {"3.0.0": {"config_name": "3.0.0", "sample_row": "{\"article\": \"\\\"LONDON, England (Reuters) -- Harry Potter star Da...\", \"highlights\": \"\\\"Harry Potter star Daniel Radcliffe gets \\\\u00a320M...\", \"id\": \"\\\"42c027e4ff9730fbb3de84c1af0d2c506e41c3e4\\\"\"}", "columns": ["article", "highlights", "id"], "columns_mapping": {"article": "article", "highlights": "highlights", "id": "id"}, "dataset_description": "CNN/DailyMail non-anonymized summarization dataset.\n\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary\n", "dataset_name": "cnn_dailymail"}, "1.0.0": {"config_name": "1.0.0", "sample_row": "{\"article\": \"\\\"LONDON, England (Reuters) -- Harry Potter star Da...\", \"highlights\": \"\\\"Harry Potter star Daniel Radcliffe gets \\\\u00a320M...\", \"id\": \"\\\"42c027e4ff9730fbb3de84c1af0d2c506e41c3e4\\\"\"}", "columns": ["article", "highlights", "id"], "columns_mapping": {"article": "article", "highlights": "highlights", "id": "id"}, "dataset_description": "CNN/DailyMail non-anonymized summarization dataset.\n\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary\n", "dataset_name": "cnn_dailymail"}, "2.0.0": {"config_name": "2.0.0", "sample_row": "{\"article\": \"\\\"LONDON, England (Reuters) -- Harry Potter star Da...\", \"highlights\": \"\\\"Harry Potter star Daniel Radcliffe gets \\\\u00a320M...\", \"id\": \"\\\"42c027e4ff9730fbb3de84c1af0d2c506e41c3e4\\\"\"}", "columns": ["article", "highlights", "id"], "columns_mapping": {"article": "article", "highlights": "highlights", "id": "id"}, "dataset_description": "CNN/DailyMail non-anonymized summarization dataset.\n\nThere are two features:\n - article: text of news article, used as the document to be summarized\n - highlights: joined text of highlights with and around each\n highlight, which is the target summary\n", "dataset_name": "cnn_dailymail"}}, "tags": ["task_categories:summarization", "task_ids:news-articles-summarization", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "coarse_discourse": {"dataset_name": "coarse_discourse", "description": "dataset contains discourse annotation and relation on threads from reddit during 2016", "downloads": 449, "configs": {"default": {"config_name": "default", "sample_row": "{\"title\": \"\\\"DTX120: #87 - Nashville\\\"\", \"is_self_post\": \"true\", \"subreddit\": \"\\\"100movies365days\\\"\", \"url\": \"\\\"https://www.reddit.com/r/100movies365days/comment...\", \"majority_link\": \"\\\"none\\\"\", \"is_first_post\": \"false\", \"majority_type\": \"\\\"announcement\\\"\", \"id_post\": \"\\\"t3_1bx6qw\\\"\", \"post_depth\": \"-1\", \"in_reply_to\": \"\\\"\\\"\", \"annotations.annotator\": \"[\\\"fc96a15ab87f02dd1998ff55a64f6478\\\", \\\"e9e4b3ab3551...\", \"annotations.link_to_post\": \"[\\\"\\\", \\\"\\\", \\\"\\\"]\", \"annotations.main_type\": \"[\\\"announcement\\\", \\\"announcement\\\", \\\"announcement\\\"]...\"}", "columns": ["title", "is_self_post", "subreddit", "url", "majority_link", "is_first_post", "majority_type", "id_post", "post_depth", "in_reply_to", "annotations_annotator", "annotations_link_to_post", "annotations_main_type"], "columns_mapping": {"title": "title", "is_self_post": "is_self_post", "subreddit": "subreddit", "url": "url", "majority_link": "majority_link", "is_first_post": "is_first_post", "majority_type": "majority_type", "id_post": "id_post", "post_depth": "post_depth", "in_reply_to": "in_reply_to", "annotations.annotator": "annotations_annotator", "annotations.link_to_post": "annotations_link_to_post", "annotations.main_type": "annotations_main_type"}, "dataset_description": "dataset contains discourse annotation and relation on threads from reddit during 2016\n", "dataset_name": "coarse_discourse"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "codah": {"dataset_name": "codah", "description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.", "downloads": 1636, "configs": {"codah": {"config_name": "codah", "sample_row": "{\"id\": \"0\", \"question_category\": \"5\", \"question_propmt\": \"\\\"I am always very hungry before I go to bed. I am\\\"...\", \"candidate_answers\": \"[\\\"concerned that this is an illness.\\\", \\\"glad that ...\", \"correct_answer_idx\": \"3\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}, "fold_0": {"config_name": "fold_0", "sample_row": "{\"id\": \"0\", \"question_category\": \"3\", \"question_propmt\": \"\\\"The chicken cannot fly. It\\\"\", \"candidate_answers\": \"[\\\"flies.\\\", \\\"spreads its wings and flies.\\\", \\\"crosse...\", \"correct_answer_idx\": \"2\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}, "fold_1": {"config_name": "fold_1", "sample_row": "{\"id\": \"0\", \"question_category\": \"5\", \"question_propmt\": \"\\\"Suzy reached the exam centre on time. She\\\"\", \"candidate_answers\": \"[\\\"danced her way to her room.\\\", \\\"bought tofu for t...\", \"correct_answer_idx\": \"2\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}, "fold_2": {"config_name": "fold_2", "sample_row": "{\"id\": \"0\", \"question_category\": \"5\", \"question_propmt\": \"\\\"Suzy reached the exam centre on time. She\\\"\", \"candidate_answers\": \"[\\\"danced her way to her room.\\\", \\\"bought tofu for t...\", \"correct_answer_idx\": \"2\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}, "fold_3": {"config_name": "fold_3", "sample_row": "{\"id\": \"0\", \"question_category\": \"5\", \"question_propmt\": \"\\\"Suzy reached the exam centre on time. She\\\"\", \"candidate_answers\": \"[\\\"danced her way to her room.\\\", \\\"bought tofu for t...\", \"correct_answer_idx\": \"2\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}, "fold_4": {"config_name": "fold_4", "sample_row": "{\"id\": \"0\", \"question_category\": \"5\", \"question_propmt\": \"\\\"Suzy reached the exam centre on time. She\\\"\", \"candidate_answers\": \"[\\\"danced her way to her room.\\\", \\\"bought tofu for t...\", \"correct_answer_idx\": \"2\"}", "columns": ["id", "question_category", "question_propmt", "candidate_answers", "correct_answer_idx"], "columns_mapping": {"id": "id", "question_category": "question_category", "question_propmt": "question_propmt", "candidate_answers": "candidate_answers", "correct_answer_idx": "correct_answer_idx"}, "dataset_description": "The COmmonsense Dataset Adversarially-authored by Humans (CODAH) is an evaluation set for commonsense question-answering in the sentence completion style of SWAG. As opposed to other automatically generated NLI datasets, CODAH is adversarially constructed by humans who can view feedback from a pre-trained model and use this information to design challenging commonsense questions. Our experimental results show that CODAH questions present a complementary extension to the SWAG dataset, testing additional modes of common sense.\n", "dataset_name": "codah"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "code_search_net": {"dataset_name": "code_search_net", "description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.", "downloads": 69261, "configs": {"all": {"config_name": "all", "sample_row": "{\"repository_name\": \"\\\"ageitgey/face_recognition\\\"\", \"func_path_in_repository\": \"\\\"examples/face_recognition_knn.py\\\"\", \"func_name\": \"\\\"train\\\"\", \"whole_func_string\": \"\\\"def train(train_dir, model_save_path=None, n_neig...\", \"language\": \"\\\"python\\\"\", \"func_code_string\": \"\\\"def train(train_dir, model_save_path=None, n_neig...\", \"func_code_tokens\": \"[\\\"def\\\", \\\"train\\\", \\\"(\\\", \\\"train_dir\\\", \\\",\\\", \\\"model_sav...\", \"func_documentation_string\": \"\\\"Trains a k-nearest neighbors classifier for face ...\", \"func_documentation_tokens\": \"[\\\"Trains\\\", \\\"a\\\", \\\"k\\\", \\\"-\\\", \\\"nearest\\\", \\\"neighbors\\\", ...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/ageitgey/face_recognition/blob...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "java": {"config_name": "java", "sample_row": "{\"repository_name\": \"\\\"spring-projects/spring-boot\\\"\", \"func_path_in_repository\": \"\\\"spring-boot-project/spring-boot/src/main/java/org...\", \"func_name\": \"\\\"IndexedElementsBinder.bindIndexed\\\"\", \"whole_func_string\": \"\\\"protected final void bindIndexed(ConfigurationPro...\", \"language\": \"\\\"java\\\"\", \"func_code_string\": \"\\\"protected final void bindIndexed(ConfigurationPro...\", \"func_code_tokens\": \"[\\\"protected\\\", \\\"final\\\", \\\"void\\\", \\\"bindIndexed\\\", \\\"(\\\",...\", \"func_documentation_string\": \"\\\"Bind indexed elements to the supplied collection....\", \"func_documentation_tokens\": \"[\\\"Bind\\\", \\\"indexed\\\", \\\"elements\\\", \\\"to\\\", \\\"the\\\", \\\"supp...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/spring-projects/spring-boot/bl...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "go": {"config_name": "go", "sample_row": "{\"repository_name\": \"\\\"kubernetes/kubernetes\\\"\", \"func_path_in_repository\": \"\\\"staging/src/k8s.io/apimachinery/pkg/runtime/exten...\", \"func_name\": \"\\\"MarshalJSON\\\"\", \"whole_func_string\": \"\\\"func (re RawExtension) MarshalJSON() ([]byte, err...\", \"language\": \"\\\"go\\\"\", \"func_code_string\": \"\\\"func (re RawExtension) MarshalJSON() ([]byte, err...\", \"func_code_tokens\": \"[\\\"func\\\", \\\"(\\\", \\\"re\\\", \\\"RawExtension\\\", \\\")\\\", \\\"MarshalJ...\", \"func_documentation_string\": \"\\\"// MarshalJSON may get called on pointers or valu...\", \"func_documentation_tokens\": \"[\\\"MarshalJSON\\\", \\\"may\\\", \\\"get\\\", \\\"called\\\", \\\"on\\\", \\\"poi...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/kubernetes/kubernetes/blob/6a8...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "python": {"config_name": "python", "sample_row": "{\"repository_name\": \"\\\"ageitgey/face_recognition\\\"\", \"func_path_in_repository\": \"\\\"examples/face_recognition_knn.py\\\"\", \"func_name\": \"\\\"train\\\"\", \"whole_func_string\": \"\\\"def train(train_dir, model_save_path=None, n_neig...\", \"language\": \"\\\"python\\\"\", \"func_code_string\": \"\\\"def train(train_dir, model_save_path=None, n_neig...\", \"func_code_tokens\": \"[\\\"def\\\", \\\"train\\\", \\\"(\\\", \\\"train_dir\\\", \\\",\\\", \\\"model_sav...\", \"func_documentation_string\": \"\\\"Trains a k-nearest neighbors classifier for face ...\", \"func_documentation_tokens\": \"[\\\"Trains\\\", \\\"a\\\", \\\"k\\\", \\\"-\\\", \\\"nearest\\\", \\\"neighbors\\\", ...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/ageitgey/face_recognition/blob...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "javascript": {"config_name": "javascript", "sample_row": "{\"repository_name\": \"\\\"Microsoft/vscode\\\"\", \"func_path_in_repository\": \"\\\"build/lib/treeshaking.js\\\"\", \"func_name\": \"\\\"createTypeScriptLanguageService\\\"\", \"whole_func_string\": \"\\\"function createTypeScriptLanguageService(options)...\", \"language\": \"\\\"javascript\\\"\", \"func_code_string\": \"\\\"function createTypeScriptLanguageService(options)...\", \"func_code_tokens\": \"[\\\"function\\\", \\\"createTypeScriptLanguageService\\\", \\\"(...\", \"func_documentation_string\": \"\\\"#region Discovery, LanguageService & Setup\\\"\", \"func_documentation_tokens\": \"[\\\"#region\\\", \\\"Discovery\\\", \\\"LanguageService\\\", \\\"&\\\", \\\"...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/Microsoft/vscode/blob/693a13cd...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "ruby": {"config_name": "ruby", "sample_row": "{\"repository_name\": \"\\\"rails/rails\\\"\", \"func_path_in_repository\": \"\\\"activesupport/lib/active_support/current_attribut...\", \"func_name\": \"\\\"ActiveSupport.CurrentAttributes.set\\\"\", \"whole_func_string\": \"\\\"def set(set_attributes)\\\\n old_attributes = c...\", \"language\": \"\\\"ruby\\\"\", \"func_code_string\": \"\\\"def set(set_attributes)\\\\n old_attributes = c...\", \"func_code_tokens\": \"[\\\"def\\\", \\\"set\\\", \\\"(\\\", \\\"set_attributes\\\", \\\")\\\", \\\"old_at...\", \"func_documentation_string\": \"\\\"Expose one or more attributes within a block. Old...\", \"func_documentation_tokens\": \"[\\\"Expose\\\", \\\"one\\\", \\\"or\\\", \\\"more\\\", \\\"attributes\\\", \\\"wit...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/rails/rails/blob/85a8bc644be69...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}, "php": {"config_name": "php", "sample_row": "{\"repository_name\": \"\\\"domnikl/DesignPatternsPHP\\\"\", \"func_path_in_repository\": \"\\\"Structural/Registry/Registry.php\\\"\", \"func_name\": \"\\\"Registry.set\\\"\", \"whole_func_string\": \"\\\"public static function set(string $key, $value)\\\\n...\", \"language\": \"\\\"php\\\"\", \"func_code_string\": \"\\\"public static function set(string $key, $value)\\\\n...\", \"func_code_tokens\": \"[\\\"public\\\", \\\"static\\\", \\\"function\\\", \\\"set\\\", \\\"(\\\", \\\"stri...\", \"func_documentation_string\": \"\\\"@param string $key\\\\n@param mixed $value\\\\n\\\\n@retu...\", \"func_documentation_tokens\": \"[\\\"@param\\\", \\\"string\\\", \\\"$key\\\", \\\"@param\\\", \\\"mixed\\\", \\\"$...\", \"split_name\": \"\\\"train\\\"\", \"func_code_url\": \"\\\"https://github.com/domnikl/DesignPatternsPHP/blob...\"}", "columns": ["repository_name", "func_path_in_repository", "func_name", "whole_func_string", "language", "func_code_string", "func_code_tokens", "func_documentation_string", "func_documentation_tokens", "split_name", "func_code_url"], "columns_mapping": {"repository_name": "repository_name", "func_path_in_repository": "func_path_in_repository", "func_name": "func_name", "whole_func_string": "whole_func_string", "language": "language", "func_code_string": "func_code_string", "func_code_tokens": "func_code_tokens", "func_documentation_string": "func_documentation_string", "func_documentation_tokens": "func_documentation_tokens", "split_name": "split_name", "func_code_url": "func_code_url"}, "dataset_description": "CodeSearchNet corpus contains about 6 million functions from open-source code spanning six programming languages (Go, Java, JavaScript, PHP, Python, and Ruby). The CodeSearchNet Corpus also contains automatically generated query-like natural language for 2 million functions, obtained from mechanically scraping and preprocessing associated function documentation.\n", "dataset_name": "code_search_net"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:no-annotation", "multilinguality:multilingual", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_cc_clone_detection_big_clone_bench": {"dataset_name": "code_x_glue_cc_clone_detection_big_clone_bench", "description": "Given two codes as the input, the task is to do binary classification (0/1), where 1 stands for semantic equivalence and 0 for others. Models are evaluated by F1 score.\nThe dataset we use is BigCloneBench and filtered following the paper Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree.", "downloads": 648, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"id1\": \"13988825\", \"id2\": \"8660836\", \"func1\": \"\\\" private void setNodekeyInJsonResponse(String ...\", \"func2\": \"\\\" public void transform(String style, String sp...\", \"label\": \"false\"}", "columns": ["id", "id1", "id2", "func1", "func2", "label"], "columns_mapping": {"id": "id", "id1": "id1", "id2": "id2", "func1": "func1", "func2": "func2", "label": "label"}, "dataset_description": "CodeXGLUE Clone-detection-BigCloneBench dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Clone-detection-BigCloneBench\n\nGiven two codes as the input, the task is to do binary classification (0/1), where 1 stands for semantic equivalence and 0 for others. Models are evaluated by F1 score.\nThe dataset we use is BigCloneBench and filtered following the paper Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree.", "dataset_name": "code_x_glue_cc_clone_detection_big_clone_bench"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_cc_cloze_testing_all": {"dataset_name": "code_x_glue_cc_cloze_testing_all", "description": "Cloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "downloads": 1031, "configs": {"go": {"config_name": "go", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"MarshalJSON\\\", \\\"supports\\\", \\\"json\\\", \\\".\\\", \\\"Marshale...\", \"pl_tokens\": \"[\\\"func\\\", \\\"(\\\", \\\"v\\\", \\\"ContextRealtimeData\\\", \\\")\\\", \\\"Ma...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}, "java": {"config_name": "java", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"/\\\", \\\"*\\\", \\\"(\\\", \\\"non\\\", \\\"-\\\", \\\"Javadoc\\\", \\\")\\\"]\", \"pl_tokens\": \"[\\\"@\\\", \\\"Override\\\", \\\"public\\\", \\\"int\\\", \\\"peekBit\\\", \\\"(\\\",...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}, "javascript": {"config_name": "javascript", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"Cast\\\", \\\"query\\\", \\\"params\\\", \\\"according\\\", \\\"to\\\", \\\"ty...\", \"pl_tokens\": \"[\\\"function\\\", \\\"castQueryParams\\\", \\\"(\\\", \\\"relId\\\", \\\",\\\",...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}, "php": {"config_name": "php", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"Get\\\", \\\"choices\\\", \\\".\\\"]\", \"pl_tokens\": \"[\\\"protected\\\", \\\"\\\", \\\"getChoices\\\", \\\"(\\\", \\\"FormFi...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}, "python": {"config_name": "python", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"Post\\\", \\\"a\\\", \\\"review\\\"]\", \"pl_tokens\": \"[\\\"def\\\", \\\"post_review\\\", \\\"(\\\", \\\"session\\\", \\\",\\\", \\\"revie...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}, "ruby": {"config_name": "ruby", "sample_row": "{\"id\": \"0\", \"idx\": \"\\\"all-1\\\"\", \"nl_tokens\": \"[\\\"By\\\", \\\"default\\\", \\\"taskers\\\", \\\"don\\\", \\\"t\\\", \\\"see\\\", \\\"t...\", \"pl_tokens\": \"[\\\"def\\\", \\\"gather_vars\\\", \\\"(\\\", \\\"executor\\\", \\\",\\\", \\\"tcon...\"}", "columns": ["id", "idx", "nl_tokens", "pl_tokens"], "columns_mapping": {"id": "id", "idx": "idx", "nl_tokens": "nl_tokens", "pl_tokens": "pl_tokens"}, "dataset_description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "dataset_name": "code_x_glue_cc_cloze_testing_all"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:slot-filling", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_cc_code_completion_line": {"dataset_name": "code_x_glue_cc_code_completion_line", "description": "Complete the unfinished line given previous context. Models are evaluated by exact match and edit similarity.\nWe propose line completion task to test model's ability to autocomplete a line. Majority code completion systems behave well in token level completion, but fail in completing an unfinished line like a method call with specific parameters, a function signature, a loop condition, a variable definition and so on. When a software develop finish one or more tokens of the current line, the line level completion model is expected to generate the entire line of syntactically correct code.\nLine level code completion task shares the train/dev dataset with token level completion. After training a model on CodeCompletion-token, you could directly use it to test on line-level completion.", "downloads": 445, "configs": {"java": {"config_name": "java", "sample_row": "{\"id\": \"0\", \"input\": \"\\\" package org . rubypeople . rdt . internal . u...\", \"gt\": \"\\\"\\\"\"}", "columns": ["id", "input", "gt"], "columns_mapping": {"id": "id", "input": "input", "gt": "gt"}, "dataset_description": "CodeXGLUE CodeCompletion-line dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line\n\nComplete the unfinished line given previous context. Models are evaluated by exact match and edit similarity.\nWe propose line completion task to test model's ability to autocomplete a line. Majority code completion systems behave well in token level completion, but fail in completing an unfinished line like a method call with specific parameters, a function signature, a loop condition, a variable definition and so on. When a software develop finish one or more tokens of the current line, the line level completion model is expected to generate the entire line of syntactically correct code.\nLine level code completion task shares the train/dev dataset with token level completion. After training a model on CodeCompletion-token, you could directly use it to test on line-level completion.", "dataset_name": "code_x_glue_cc_code_completion_line"}, "python": {"config_name": "python", "sample_row": "{\"id\": \"0\", \"input\": \"\\\" from __future__ import absolute_import ...\", \"gt\": \"\\\"\\\"\"}", "columns": ["id", "input", "gt"], "columns_mapping": {"id": "id", "input": "input", "gt": "gt"}, "dataset_description": "CodeXGLUE CodeCompletion-line dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line\n\nComplete the unfinished line given previous context. Models are evaluated by exact match and edit similarity.\nWe propose line completion task to test model's ability to autocomplete a line. Majority code completion systems behave well in token level completion, but fail in completing an unfinished line like a method call with specific parameters, a function signature, a loop condition, a variable definition and so on. When a software develop finish one or more tokens of the current line, the line level completion model is expected to generate the entire line of syntactically correct code.\nLine level code completion task shares the train/dev dataset with token level completion. After training a model on CodeCompletion-token, you could directly use it to test on line-level completion.", "dataset_name": "code_x_glue_cc_code_completion_line"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:slot-filling", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_cc_code_completion_token": {"dataset_name": "code_x_glue_cc_code_completion_token", "description": "Predict next code token given context of previous tokens. Models are evaluated by token level accuracy.\nCode completion is a one of the most widely used features in software development through IDEs. An effective code completion tool could improve software developers' productivity. We provide code completion evaluation tasks in two granularities -- token level and line level. Here we introduce token level code completion. Token level task is analogous to language modeling. Models should have be able to predict the next token in arbitary types.", "downloads": 457, "configs": {"java": {"config_name": "java", "sample_row": "{\"id\": \"0\", \"code\": \"[\\\"\\\", \\\"package\\\", \\\"org\\\", \\\".\\\", \\\"sqlproc\\\", \\\".\\\", \\\"ds...\"}", "columns": ["id", "code"], "columns_mapping": {"id": "id", "code": "code"}, "dataset_description": "CodeXGLUE CodeCompletion-token dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token\n\nPredict next code token given context of previous tokens. Models are evaluated by token level accuracy.\nCode completion is a one of the most widely used features in software development through IDEs. An effective code completion tool could improve software developers' productivity. We provide code completion evaluation tasks in two granularities -- token level and line level. Here we introduce token level code completion. Token level task is analogous to language modeling. Models should have be able to predict the next token in arbitary types.\n", "dataset_name": "code_x_glue_cc_code_completion_token"}, "python": {"config_name": "python", "sample_row": "{\"id\": \"0\", \"path\": \"\\\"00/wikihouse/urls.py\\\\n\\\"\", \"code\": \"[\\\"\\\", \\\"from\\\", \\\"bootstrap\\\", \\\"import\\\", \\\"Bootstrap\\\"...\"}", "columns": ["id", "path", "code"], "columns_mapping": {"id": "id", "path": "path", "code": "code"}, "dataset_description": "CodeXGLUE CodeCompletion-token dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token\n\nPredict next code token given context of previous tokens. Models are evaluated by token level accuracy.\nCode completion is a one of the most widely used features in software development through IDEs. An effective code completion tool could improve software developers' productivity. We provide code completion evaluation tasks in two granularities -- token level and line level. Here we introduce token level code completion. Token level task is analogous to language modeling. Models should have be able to predict the next token in arbitary types.\n", "dataset_name": "code_x_glue_cc_code_completion_token"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_cc_code_refinement": {"dataset_name": "code_x_glue_cc_code_refinement", "description": "We use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). The source side is a Java function with bugs and the target side is the refined one. All the function and variable names are normalized. Their dataset contains two subsets ( i.e.small and medium) based on the function length.", "downloads": 647, "configs": {"medium": {"config_name": "medium", "sample_row": "{\"id\": \"0\", \"buggy\": \"\\\"public static TYPE_1 init ( java.lang.String name...\", \"fixed\": \"\\\"public static TYPE_1 init ( java.lang.String name...\"}", "columns": ["id", "buggy", "fixed"], "columns_mapping": {"id": "id", "buggy": "buggy", "fixed": "fixed"}, "dataset_description": "CodeXGLUE code-refinement dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-refinement\n\nWe use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). The source side is a Java function with bugs and the target side is the refined one. All the function and variable names are normalized. Their dataset contains two subsets ( i.e.small and medium) based on the function length.", "dataset_name": "code_x_glue_cc_code_refinement"}, "small": {"config_name": "small", "sample_row": "{\"id\": \"0\", \"buggy\": \"\\\"public java.lang.String METHOD_1 ( ) { return new...\", \"fixed\": \"\\\"public java.lang.String METHOD_1 ( ) { return new...\"}", "columns": ["id", "buggy", "fixed"], "columns_mapping": {"id": "id", "buggy": "buggy", "fixed": "fixed"}, "dataset_description": "CodeXGLUE code-refinement dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-refinement\n\nWe use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). The source side is a Java function with bugs and the target side is the refined one. All the function and variable names are normalized. Their dataset contains two subsets ( i.e.small and medium) based on the function length.", "dataset_name": "code_x_glue_cc_code_refinement"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:expert-generated", "multilinguality:other-programming-languages", "source_datasets:original", "language:code", "debugging"], "is_gated": false}, "code_x_glue_cc_code_to_code_trans": {"dataset_name": "code_x_glue_cc_code_to_code_trans", "description": "The dataset is collected from several public repos, including Lucene(http://lucene.apache.org/), POI(http://poi.apache.org/), JGit(https://github.com/eclipse/jgit/) and Antlr(https://github.com/antlr/).\n We collect both the Java and C# versions of the codes and find the parallel functions. After removing duplicates and functions with the empty body, we split the whole dataset into training, validation and test sets.", "downloads": 566, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"java\": \"\\\"public ListSpeechSynthesisTasksResult listSpeechS...\", \"cs\": \"\\\"public virtual ListSpeechSynthesisTasksResponse L...\"}", "columns": ["id", "java", "cs"], "columns_mapping": {"id": "id", "java": "java", "cs": "cs"}, "dataset_description": "CodeXGLUE code-to-code-trans dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-to-code-trans\n\nThe dataset is collected from several public repos, including Lucene(http://lucene.apache.org/), POI(http://poi.apache.org/), JGit(https://github.com/eclipse/jgit/) and Antlr(https://github.com/antlr/).\n We collect both the Java and C# versions of the codes and find the parallel functions. After removing duplicates and functions with the empty body, we split the whole dataset into training, validation and test sets.", "dataset_name": "code_x_glue_cc_code_to_code_trans"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:other-programming-languages", "source_datasets:original", "language:code", "code-to-code"], "is_gated": false}, "code_x_glue_cc_defect_detection": {"dataset_name": "code_x_glue_cc_defect_detection", "description": "Given a source code, the task is to identify whether it is an insecure code that may attack software systems, such as resource leaks, use-after-free vulnerabilities and DoS attack. We treat the task as binary classification (0/1), where 1 stands for insecure code and 0 for secure code.\nThe dataset we use comes from the paper Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks. We combine all projects and split 80%/10%/10% for training/dev/test.", "downloads": 526, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"func\": \"\\\"static av_cold int vdadec_init(AVCodecContext *av...\", \"target\": \"false\", \"project\": \"\\\"FFmpeg\\\"\", \"commit_id\": \"\\\"973b1a6b9070e2bf17d17568cbaf4043ce931f51\\\"\"}", "columns": ["id", "func", "target", "project", "commit_id"], "columns_mapping": {"id": "id", "func": "func", "target": "target", "project": "project", "commit_id": "commit_id"}, "dataset_description": "CodeXGLUE Defect-detection dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Defect-detection\n\nGiven a source code, the task is to identify whether it is an insecure code that may attack software systems, such as resource leaks, use-after-free vulnerabilities and DoS attack. We treat the task as binary classification (0/1), where 1 stands for insecure code and 0 for secure code.\nThe dataset we use comes from the paper Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks. We combine all projects and split 80%/10%/10% for training/dev/test.", "dataset_name": "code_x_glue_cc_defect_detection"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:found", "multilinguality:other-programming-languages", "source_datasets:original", "language:code"], "is_gated": false}, "code_x_glue_tc_text_to_code": {"dataset_name": "code_x_glue_tc_text_to_code", "description": "We use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to Code in Programmatic Context. See paper for details.", "downloads": 1246, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"0\", \"nl\": \"\\\"check if details are parsed . concode_field_sep C...\", \"code\": \"\\\"boolean function ( ) { return isParsed ; }\\\"\"}", "columns": ["id", "nl", "code"], "columns_mapping": {"id": "id", "nl": "nl", "code": "code"}, "dataset_description": "CodeXGLUE text-to-code dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Code/text-to-code\n\nWe use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to Code in Programmatic Context. See paper for details.", "dataset_name": "code_x_glue_tc_text_to_code"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:other-programming-languages", "source_datasets:original", "language:code", "language:en", "text-to-code"], "is_gated": false}, "code_x_glue_tt_text_to_text": {"dataset_name": "code_x_glue_tt_text_to_text", "description": "The dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "downloads": 821, "configs": {"da_en": {"config_name": "da_en", "sample_row": "{\"id\": \"0\", \"source\": \"\\\"title : " Oversigt over ops\\\\u00e6tninger for...\", \"target\": \"\\\"title : Overview of Setups for Service Items and ...\"}", "columns": ["id", "source", "target"], "columns_mapping": {"id": "id", "source": "source", "target": "target"}, "dataset_description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "dataset_name": "code_x_glue_tt_text_to_text"}, "lv_en": {"config_name": "lv_en", "sample_row": "{\"id\": \"0\", \"source\": \"\\\"title : Pakalpojumu objektu izveide\\\\n\\\"\", \"target\": \"\\\"title : Create service objects\\\\n\\\"\"}", "columns": ["id", "source", "target"], "columns_mapping": {"id": "id", "source": "source", "target": "target"}, "dataset_description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "dataset_name": "code_x_glue_tt_text_to_text"}, "no_en": {"config_name": "no_en", "sample_row": "{\"id\": \"0\", \"source\": \"\\\"title : Oversikt over oppsett av servicevarer og ...\", \"target\": \"\\\"title : Overview of Setups for Service Items and ...\"}", "columns": ["id", "source", "target"], "columns_mapping": {"id": "id", "source": "source", "target": "target"}, "dataset_description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "dataset_name": "code_x_glue_tt_text_to_text"}, "zh_en": {"config_name": "zh_en", "sample_row": "{\"id\": \"0\", \"source\": \"\\\"\\\\u4ee5\\\\u4e0b \\\\u547d\\\\u540d \\\\u7a7a\\\\u95f4 \\\\u5305\\\\u54...\", \"target\": \"\\\"The following namespaces contain APIs that allow ...\"}", "columns": ["id", "source", "target"], "columns_mapping": {"id": "id", "source": "source", "target": "target"}, "dataset_description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "dataset_name": "code_x_glue_tt_text_to_text"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:da", "language:en", "language:lv", "language:nb", "language:zh", "code-documentation-translation"], "is_gated": false}, "com_qa": {"dataset_name": "com_qa", "description": "ComQA is a dataset of 11,214 questions, which were collected from WikiAnswers, a community question answering website.\nBy collecting questions from such a site we ensure that the information needs are ones of interest to actual users.\nMoreover, questions posed there are often cannot be answered by commercial search engines or QA technology, making them\nmore interesting for driving future research compared to those collected from an engine's query log. The dataset contains\nquestions with various challenging phenomena such as the need for temporal reasoning, comparison (e.g., comparatives,\nsuperlatives, ordinals), compositionality (multiple, possibly nested, subquestions with multiple entities), and\nunanswerable questions (e.g., Who was the first human being on Mars?). Through a large crowdsourcing effort, questions\nin ComQA are grouped into 4,834 paraphrase clusters that express the same information need. Each cluster is annotated\nwith its answer(s). ComQA answers come in the form of Wikipedia entities wherever possible. Wherever the answers are\ntemporal or measurable quantities, TIMEX3 and the International System of Units (SI) are used for normalization.", "downloads": 334, "configs": {"default": {"config_name": "default", "sample_row": "{\"cluster_id\": \"\\\"cluster-1754\\\"\", \"questions\": \"[\\\"what years did cale yarborough win his cup champ...\", \"answers\": \"[\\\"1976\\\", \\\"1978\\\", \\\"1977\\\"]\"}", "columns": ["cluster_id", "questions", "answers"], "columns_mapping": {"cluster_id": "cluster_id", "questions": "questions", "answers": "answers"}, "dataset_description": "ComQA is a dataset of 11,214 questions, which were collected from WikiAnswers, a community question answering website.\nBy collecting questions from such a site we ensure that the information needs are ones of interest to actual users.\nMoreover, questions posed there are often cannot be answered by commercial search engines or QA technology, making them\nmore interesting for driving future research compared to those collected from an engine's query log. The dataset contains\nquestions with various challenging phenomena such as the need for temporal reasoning, comparison (e.g., comparatives,\nsuperlatives, ordinals), compositionality (multiple, possibly nested, subquestions with multiple entities), and\nunanswerable questions (e.g., Who was the first human being on Mars?). Through a large crowdsourcing effort, questions\nin ComQA are grouped into 4,834 paraphrase clusters that express the same information need. Each cluster is annotated\nwith its answer(s). ComQA answers come in the form of Wikipedia entities wherever possible. Wherever the answers are\ntemporal or measurable quantities, TIMEX3 and the International System of Units (SI) are used for normalization.\n", "dataset_name": "com_qa"}}, "tags": ["task_categories:question-answering", "language:en"], "is_gated": false}, "common_gen": {"dataset_name": "common_gen", "description": "CommonGen is a constrained text generation task, associated with a benchmark dataset,\nto explicitly test machines for the ability of generative commonsense reasoning. Given\na set of common concepts; the task is to generate a coherent sentence describing an\neveryday scenario using these concepts.\n\nCommonGen is challenging because it inherently requires 1) relational reasoning using\nbackground commonsense knowledge, and 2) compositional generalization ability to work\non unseen concept combinations. Our dataset, constructed through a combination of\ncrowd-sourcing from AMT and existing caption corpora, consists of 30k concept-sets and\n50k sentences in total.", "downloads": 5845, "configs": {"default": {"config_name": "default", "sample_row": "{\"concept_set_idx\": \"0\", \"concepts\": \"[\\\"ski\\\", \\\"mountain\\\", \\\"skier\\\"]\", \"target\": \"\\\"Skier skis down the mountain\\\"\"}", "columns": ["concept_set_idx", "concepts", "target"], "columns_mapping": {"concept_set_idx": "concept_set_idx", "concepts": "concepts", "target": "target"}, "dataset_description": "CommonGen is a constrained text generation task, associated with a benchmark dataset,\nto explicitly test machines for the ability of generative commonsense reasoning. Given\na set of common concepts; the task is to generate a coherent sentence describing an\neveryday scenario using these concepts.\n\nCommonGen is challenging because it inherently requires 1) relational reasoning using\nbackground commonsense knowledge, and 2) compositional generalization ability to work\non unseen concept combinations. Our dataset, constructed through a combination of\ncrowd-sourcing from AMT and existing caption corpora, consists of 30k concept-sets and\n50k sentences in total.\n", "dataset_name": "common_gen"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "concepts-to-text"], "is_gated": false}, "commonsense_qa": {"dataset_name": "commonsense_qa", "description": "CommonsenseQA is a new multiple-choice question answering dataset that requires different types of commonsense knowledge\nto predict the correct answers . It contains 12,102 questions with one correct answer and four distractor answers.\nThe dataset is provided in two major training/validation/testing set splits: \"Random split\" which is the main evaluation\nsplit, and \"Question token split\", see paper for details.", "downloads": 29428, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"075e483d21c29a511267ef62bedc0461\\\"\", \"question\": \"\\\"The sanctions against the school were a punishing...\", \"question_concept\": \"\\\"punishing\\\"\", \"choices.label\": \"[\\\"A\\\", \\\"B\\\", \\\"C\\\", \\\"D\\\", \\\"E\\\"]\", \"choices.text\": \"[\\\"ignore\\\", \\\"enforce\\\", \\\"authoritarian\\\", \\\"yell at\\\", ...\", \"answerKey\": \"\\\"A\\\"\"}", "columns": ["id", "question", "question_concept", "choices_label", "choices_text", "answerKey"], "columns_mapping": {"id": "id", "question": "question", "question_concept": "question_concept", "choices.label": "choices_label", "choices.text": "choices_text", "answerKey": "answerKey"}, "dataset_description": "CommonsenseQA is a new multiple-choice question answering dataset that requires different types of commonsense knowledge\nto predict the correct answers . It contains 12,102 questions with one correct answer and four distractor answers.\nThe dataset is provided in two major training/validation/testing set splits: \"Random split\" which is the main evaluation\nsplit, and \"Question token split\", see paper for details.\n", "dataset_name": "commonsense_qa"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "conceptnet5": {"dataset_name": "conceptnet5", "description": "This dataset is designed to provide training data\r\nfor common sense relationships pulls together from various sources.\r\n\r\nThe dataset is multi-lingual. See langauge codes and language info\r\nhere: https://github.com/commonsense/conceptnet5/wiki/Languages\r\n\r\n\r\nThis dataset provides an interface for the conceptnet5 csv file, and\r\nsome (but not all) of the raw text data used to build conceptnet5:\r\nomcsnet_sentences_free.txt, and omcsnet_sentences_more.txt.\r\n\r\nOne use of this dataset would be to learn to extract the conceptnet\r\nrelationship from the omcsnet sentences.\r\n\r\nConceptnet5 has 34,074,917 relationships. Of those relationships,\r\nthere are 2,176,099 surface text sentences related to those 2M\r\nentries.\r\n\r\nomcsnet_sentences_free has 898,161 lines. omcsnet_sentences_more has\r\n2,001,736 lines.\r\n\r\nOriginal downloads are available here\r\nhttps://github.com/commonsense/conceptnet5/wiki/Downloads. For more\r\ninformation, see: https://github.com/commonsense/conceptnet5/wiki\r\n\r\nThe omcsnet data comes with the following warning from the authors of\r\nthe above site: Remember: this data comes from various forms of\r\ncrowdsourcing. Sentences in these files are not necessarily true,\r\nuseful, or appropriate.", "downloads": 713, "configs": {"conceptnet5": {"config_name": "conceptnet5", "sample_row": "{\"sentence\": \"\\\"\\\"\", \"full_rel\": \"\\\"/a/[/r/Antonym/,/c/ab/\\\\u0430\\\\u0433\\\\u044b\\\\u0440\\\\u0...\", \"rel\": \"\\\"/r/Antonym\\\"\", \"arg1\": \"\\\"/c/ab/\\\\u0430\\\\u0433\\\\u044b\\\\u0440\\\\u0443\\\\u0430/n\\\"\", \"arg2\": \"\\\"/c/ab/\\\\u0430\\\\u04a7\\\\u0441\\\\u0443\\\\u0430\\\"\", \"lang\": \"\\\"ab\\\"\", \"extra_info\": \"\\\"{\\\\\\\"dataset\\\\\\\": \\\\\\\"/d/wiktionary/en\\\\\\\", \\\\\\\"license\\\\\\\": ...\", \"weight\": \"1.0\"}", "columns": ["sentence", "full_rel", "rel", "arg1", "arg2", "lang", "extra_info", "weight"], "columns_mapping": {"sentence": "sentence", "full_rel": "full_rel", "rel": "rel", "arg1": "arg1", "arg2": "arg2", "lang": "lang", "extra_info": "extra_info", "weight": "weight"}, "dataset_description": "This dataset is designed to provide training data\nfor common sense relationships pulls together from various sources.\n\nThe dataset is multi-lingual. See langauge codes and language info\nhere: https://github.com/commonsense/conceptnet5/wiki/Languages\n\n\nThis dataset provides an interface for the conceptnet5 csv file, and\nsome (but not all) of the raw text data used to build conceptnet5:\nomcsnet_sentences_free.txt, and omcsnet_sentences_more.txt.\n\nOne use of this dataset would be to learn to extract the conceptnet\nrelationship from the omcsnet sentences.\n\nConceptnet5 has 34,074,917 relationships. Of those relationships,\nthere are 2,176,099 surface text sentences related to those 2M\nentries.\n\nomcsnet_sentences_free has 898,161 lines. omcsnet_sentences_more has\n2,001,736 lines.\n\nOriginal downloads are available here\nhttps://github.com/commonsense/conceptnet5/wiki/Downloads. For more\ninformation, see: https://github.com/commonsense/conceptnet5/wiki\n\nThe omcsnet data comes with the following warning from the authors of\nthe above site: Remember: this data comes from various forms of\ncrowdsourcing. Sentences in these files are not necessarily true,\nuseful, or appropriate.\n\n", "dataset_name": "conceptnet5"}, "omcs_sentences_free": {"config_name": "omcs_sentences_free", "sample_row": "{\"sentence\": \"\\\"text\\\"\", \"raw_data\": \"\\\"id\\\\ttext\\\\tcreator_id\\\\tcreated_on\\\\tlanguage_id\\\\tac...\", \"lang\": \"\\\"language_id\\\"\"}", "columns": ["sentence", "raw_data", "lang"], "columns_mapping": {"sentence": "sentence", "raw_data": "raw_data", "lang": "lang"}, "dataset_description": "This dataset is designed to provide training data\nfor common sense relationships pulls together from various sources.\n\nThe dataset is multi-lingual. See langauge codes and language info\nhere: https://github.com/commonsense/conceptnet5/wiki/Languages\n\n\nThis dataset provides an interface for the conceptnet5 csv file, and\nsome (but not all) of the raw text data used to build conceptnet5:\nomcsnet_sentences_free.txt, and omcsnet_sentences_more.txt.\n\nOne use of this dataset would be to learn to extract the conceptnet\nrelationship from the omcsnet sentences.\n\nConceptnet5 has 34,074,917 relationships. Of those relationships,\nthere are 2,176,099 surface text sentences related to those 2M\nentries.\n\nomcsnet_sentences_free has 898,161 lines. omcsnet_sentences_more has\n2,001,736 lines.\n\nOriginal downloads are available here\nhttps://github.com/commonsense/conceptnet5/wiki/Downloads. For more\ninformation, see: https://github.com/commonsense/conceptnet5/wiki\n\nThe omcsnet data comes with the following warning from the authors of\nthe above site: Remember: this data comes from various forms of\ncrowdsourcing. Sentences in these files are not necessarily true,\nuseful, or appropriate.\n\n", "dataset_name": "conceptnet5"}, "omcs_sentences_more": {"config_name": "omcs_sentences_more", "sample_row": "{\"sentence\": \"\\\"text\\\"\", \"raw_data\": \"\\\"id\\\\ttext\\\\tcreator_id\\\\tcreated_on\\\\tlanguage_id\\\\tac...\", \"lang\": \"\\\"language_id\\\"\"}", "columns": ["sentence", "raw_data", "lang"], "columns_mapping": {"sentence": "sentence", "raw_data": "raw_data", "lang": "lang"}, "dataset_description": "This dataset is designed to provide training data\nfor common sense relationships pulls together from various sources.\n\nThe dataset is multi-lingual. See langauge codes and language info\nhere: https://github.com/commonsense/conceptnet5/wiki/Languages\n\n\nThis dataset provides an interface for the conceptnet5 csv file, and\nsome (but not all) of the raw text data used to build conceptnet5:\nomcsnet_sentences_free.txt, and omcsnet_sentences_more.txt.\n\nOne use of this dataset would be to learn to extract the conceptnet\nrelationship from the omcsnet sentences.\n\nConceptnet5 has 34,074,917 relationships. Of those relationships,\nthere are 2,176,099 surface text sentences related to those 2M\nentries.\n\nomcsnet_sentences_free has 898,161 lines. omcsnet_sentences_more has\n2,001,736 lines.\n\nOriginal downloads are available here\nhttps://github.com/commonsense/conceptnet5/wiki/Downloads. For more\ninformation, see: https://github.com/commonsense/conceptnet5/wiki\n\nThe omcsnet data comes with the following warning from the authors of\nthe above site: Remember: this data comes from various forms of\ncrowdsourcing. Sentences in these files are not necessarily true,\nuseful, or appropriate.\n\n", "dataset_name": "conceptnet5"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:de", "language:en", "language:es", "language:fr", "language:it", "language:ja", "language:nl", "language:pt", "language:ru", "language:zh"], "is_gated": false}, "conll2000": {"dataset_name": "conll2000", "description": " Text chunking consists of dividing a text in syntactically correlated parts of words. For example, the sentence\n He reckons the current account deficit will narrow to only # 1.8 billion in September . can be divided as follows:\n[NP He ] [VP reckons ] [NP the current account deficit ] [VP will narrow ] [PP to ] [NP only # 1.8 billion ]\n[PP in ] [NP September ] .\n\nText chunking is an intermediate step towards full parsing. It was the shared task for CoNLL-2000. Training and test\ndata for this task is available. This data consists of the same partitions of the Wall Street Journal corpus (WSJ)\nas the widely used data for noun phrase chunking: sections 15-18 as training data (211727 tokens) and section 20 as\ntest data (47377 tokens). The annotation of the data has been derived from the WSJ corpus by a program written by\nSabine Buchholz from Tilburg University, The Netherlands.", "downloads": 327, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Confidence\\\", \\\"in\\\", \\\"the\\\", \\\"pound\\\", \\\"is\\\", \\\"widely...\", \"pos_tags\": \"[19, 14, 11, 19, 39, 27, 37, 32, 34, 11, 15, 19, 1...\", \"chunk_tags\": \"[11, 13, 11, 12, 21, 22, 22, 22, 22, 11, 12, 12, 1...\"}", "columns": ["id", "tokens", "pos_tags", "chunk_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "chunk_tags": "chunk_tags"}, "dataset_description": " Text chunking consists of dividing a text in syntactically correlated parts of words. For example, the sentence\n He reckons the current account deficit will narrow to only # 1.8 billion in September . can be divided as follows:\n[NP He ] [VP reckons ] [NP the current account deficit ] [VP will narrow ] [PP to ] [NP only # 1.8 billion ]\n[PP in ] [NP September ] .\n\nText chunking is an intermediate step towards full parsing. It was the shared task for CoNLL-2000. Training and test\ndata for this task is available. This data consists of the same partitions of the Wall Street Journal corpus (WSJ)\nas the widely used data for noun phrase chunking: sections 15-18 as training data (211727 tokens) and section 20 as\ntest data (47377 tokens). The annotation of the data has been derived from the WSJ corpus by a program written by\nSabine Buchholz from Tilburg University, The Netherlands.\n", "dataset_name": "conll2000"}}, "tags": ["language:en"], "is_gated": false}, "conll2002": {"dataset_name": "conll2002", "description": "Named entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\n\nThe shared task of CoNLL-2002 concerns language-independent named entity recognition.\nWe will concentrate on four types of named entities: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups.\nThe participants of the shared task will be offered training and test data for at least two languages.\nThey will use the data for developing a named-entity recognition system that includes a machine learning component.\nInformation sources other than the training data may be used in this shared task.\nWe are especially interested in methods that can use additional unannotated data for improving their performance (for example co-training).\n\nThe train/validation/test sets are available in Spanish and Dutch.\n\nFor more details see https://www.clips.uantwerpen.be/conll2002/ner/ and https://www.aclweb.org/anthology/W02-2024/", "downloads": 862, "configs": {"es": {"config_name": "es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Melbourne\\\", \\\"(\\\", \\\"Australia\\\", \\\")\\\", \\\",\\\", \\\"25\\\", \\\"m...\", \"pos_tags\": \"[29, 21, 29, 22, 13, 59, 28, 21, 28, 22, 20]\", \"ner_tags\": \"[5, 0, 5, 0, 0, 0, 0, 0, 3, 0, 0]\"}", "columns": ["id", "tokens", "pos_tags", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "ner_tags": "ner_tags"}, "dataset_description": "Named entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\n\nThe shared task of CoNLL-2002 concerns language-independent named entity recognition.\nWe will concentrate on four types of named entities: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups.\nThe participants of the shared task will be offered training and test data for at least two languages.\nThey will use the data for developing a named-entity recognition system that includes a machine learning component.\nInformation sources other than the training data may be used in this shared task.\nWe are especially interested in methods that can use additional unannotated data for improving their performance (for example co-training).\n\nThe train/validation/test sets are available in Spanish and Dutch.\n\nFor more details see https://www.clips.uantwerpen.be/conll2002/ner/ and https://www.aclweb.org/anthology/W02-2024/\n", "dataset_name": "conll2002"}, "nl": {"config_name": "nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"De\\\", \\\"tekst\\\", \\\"van\\\", \\\"het\\\", \\\"arrest\\\", \\\"is\\\", \\\"nog...\", \"pos_tags\": \"[2, 6, 8, 2, 6, 11, 1, 1, 0, 0, 3, 2, 6, 11, 1, 11...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "pos_tags", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "ner_tags": "ner_tags"}, "dataset_description": "Named entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\n\nThe shared task of CoNLL-2002 concerns language-independent named entity recognition.\nWe will concentrate on four types of named entities: persons, locations, organizations and names of miscellaneous entities that do not belong to the previous three groups.\nThe participants of the shared task will be offered training and test data for at least two languages.\nThey will use the data for developing a named-entity recognition system that includes a machine learning component.\nInformation sources other than the training data may be used in this shared task.\nWe are especially interested in methods that can use additional unannotated data for improving their performance (for example co-training).\n\nThe train/validation/test sets are available in Spanish and Dutch.\n\nFor more details see https://www.clips.uantwerpen.be/conll2002/ner/ and https://www.aclweb.org/anthology/W02-2024/\n", "dataset_name": "conll2002"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "task_ids:part-of-speech", "annotations_creators:crowdsourced", "multilinguality:multilingual", "source_datasets:original", "language:es", "language:nl"], "is_gated": false}, "conll2003": {"dataset_name": "conll2003", "description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419", "downloads": 77284, "configs": {"conll2003": {"config_name": "conll2003", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"EU\\\", \\\"rejects\\\", \\\"German\\\", \\\"call\\\", \\\"to\\\", \\\"boycott...\", \"pos_tags\": \"[22, 42, 16, 21, 35, 37, 16, 21, 7]\", \"chunk_tags\": \"[11, 21, 11, 12, 21, 22, 11, 12, 0]\", \"ner_tags\": \"[3, 0, 7, 0, 0, 0, 7, 0, 0]\"}", "columns": ["id", "tokens", "pos_tags", "chunk_tags", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "chunk_tags": "chunk_tags", "ner_tags": "ner_tags"}, "dataset_description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "dataset_name": "conll2003"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "task_ids:part-of-speech", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|other-reuters-corpus", "language:en"], "is_gated": false}, "conllpp": {"dataset_name": "conllpp", "description": "CoNLLpp is a corrected version of the CoNLL2003 NER dataset where labels of 5.38% of the sentences in the test set\nhave been manually corrected. The training set and development set are included for completeness.\nFor more details see https://www.aclweb.org/anthology/D19-1519/ and https://github.com/ZihanWangKi/CrossWeigh", "downloads": 1229, "configs": {"conllpp": {"config_name": "conllpp", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"EU\\\", \\\"rejects\\\", \\\"German\\\", \\\"call\\\", \\\"to\\\", \\\"boycott...\", \"pos_tags\": \"[22, 42, 16, 21, 35, 37, 16, 21, 7]\", \"chunk_tags\": \"[11, 21, 11, 12, 21, 22, 11, 12, 0]\", \"ner_tags\": \"[3, 0, 7, 0, 0, 0, 7, 0, 0]\"}", "columns": ["id", "tokens", "pos_tags", "chunk_tags", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "pos_tags": "pos_tags", "chunk_tags": "chunk_tags", "ner_tags": "ner_tags"}, "dataset_description": "CoNLLpp is a corrected version of the CoNLL2003 NER dataset where labels of 5.38% of the sentences in the test set\nhave been manually corrected. The training set and development set are included for completeness.\nFor more details see https://www.aclweb.org/anthology/D19-1519/ and https://github.com/ZihanWangKi/CrossWeigh\n", "dataset_name": "conllpp"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|conll2003", "language:en"], "is_gated": false}, "conv_ai_3": {"dataset_name": "conv_ai_3", "description": "The Conv AI 3 challenge is organized as part of the Search-oriented Conversational AI (SCAI) EMNLP workshop in 2020. The main aim of the conversational systems is to return an appropriate answer in response to the user requests. However, some user requests might be ambiguous. In Information Retrieval (IR) settings such a situation is handled mainly through the diversification of search result page. It is however much more challenging in dialogue settings. Hence, we aim to study the following situation for dialogue settings:\n- a user is asking an ambiguous question (where ambiguous question is a question to which one can return > 1 possible answers)\n- the system must identify that the question is ambiguous, and, instead of trying to answer it directly, ask a good clarifying question.", "downloads": 396, "configs": {"conv_ai_3": {"config_name": "conv_ai_3", "sample_row": "{\"topic_id\": \"1\", \"initial_request\": \"\\\"Tell me about Obama family tree.\\\"\", \"topic_desc\": \"\\\"Find information on President Barack Obama\\\\\\\\'s fa...\", \"clarification_need\": \"2\", \"facet_id\": \"\\\"F0001\\\"\", \"facet_desc\": \"\\\"Find the TIME magazine photo essay \\\\\\\"Barack Obama...\", \"question_id\": \"\\\"Q00384\\\"\", \"question\": \"\\\"are you interested in seeing barack obamas family...\", \"answer\": \"\\\"yes am interested in obamas family\\\"\"}", "columns": ["topic_id", "initial_request", "topic_desc", "clarification_need", "facet_id", "facet_desc", "question_id", "question", "answer"], "columns_mapping": {"topic_id": "topic_id", "initial_request": "initial_request", "topic_desc": "topic_desc", "clarification_need": "clarification_need", "facet_id": "facet_id", "facet_desc": "facet_desc", "question_id": "question_id", "question": "question", "answer": "answer"}, "dataset_description": "The Conv AI 3 challenge is organized as part of the Search-oriented Conversational AI (SCAI) EMNLP workshop in 2020. The main aim of the conversational systems is to return an appropriate answer in response to the user requests. However, some user requests might be ambiguous. In Information Retrieval (IR) settings such a situation is handled mainly through the diversification of search result page. It is however much more challenging in dialogue settings. Hence, we aim to study the following situation for dialogue settings:\n- a user is asking an ambiguous question (where ambiguous question is a question to which one can return > 1 possible answers)\n- the system must identify that the question is ambiguous, and, instead of trying to answer it directly, ask a good clarifying question.\n", "dataset_name": "conv_ai_3"}}, "tags": ["task_categories:conversational", "task_categories:text-classification", "task_ids:text-scoring", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "evaluating-dialogue-systems"], "is_gated": false}, "conv_questions": {"dataset_name": "conv_questions", "description": "ConvQuestions is the first realistic benchmark for conversational question answering over knowledge graphs.\nIt contains 11,200 conversations which can be evaluated over Wikidata. The questions feature a variety of complex\nquestion phenomena like comparisons, aggregations, compositionality, and temporal reasoning.", "downloads": 332, "configs": {"default": {"config_name": "default", "sample_row": "{\"domain\": \"\\\"music\\\"\", \"seed_entity\": \"\\\"https://www.wikidata.org/wiki/Q223495\\\"\", \"seed_entity_text\": \"\\\"The Carpenters\\\"\", \"questions\": \"[\\\"When did The Carpenters sign with A&M Records?\\\",...\", \"answers\": \"[[\\\"1969\\\"], [\\\"https://www.wikidata.org/wiki/Q928282...\", \"answer_texts\": \"[\\\"1969\\\", \\\"(They Long to Be) Close to You\\\", \\\"1983\\\",...\"}", "columns": ["domain", "seed_entity", "seed_entity_text", "questions", "answers", "answer_texts"], "columns_mapping": {"domain": "domain", "seed_entity": "seed_entity", "seed_entity_text": "seed_entity_text", "questions": "questions", "answers": "answers", "answer_texts": "answer_texts"}, "dataset_description": "ConvQuestions is the first realistic benchmark for conversational question answering over knowledge graphs.\nIt contains 11,200 conversations which can be evaluated over Wikidata. The questions feature a variety of complex\nquestion phenomena like comparisons, aggregations, compositionality, and temporal reasoning.", "dataset_name": "conv_questions"}}, "tags": ["task_categories:question-answering", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:open-domain-qa", "task_ids:dialogue-modeling", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "cos_e": {"dataset_name": "cos_e", "description": "Common Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.", "downloads": 3784, "configs": {"v1.0": {"config_name": "v1.0", "sample_row": "{\"id\": \"\\\"d3b479933e716fb388dfb297e881054c\\\"\", \"question\": \"\\\"If a lantern is not for sale, where is it likely ...\", \"choices\": \"[\\\"antique shop\\\", \\\"house\\\", \\\"dark place\\\"]\", \"answer\": \"\\\"house\\\"\", \"abstractive_explanation\": \"\\\"a house is the only place that is not likely to s...\", \"extractive_explanation\": \"\\\"not for sale\\\"\"}", "columns": ["id", "question", "choices", "answer", "abstractive_explanation", "extractive_explanation"], "columns_mapping": {"id": "id", "question": "question", "choices": "choices", "answer": "answer", "abstractive_explanation": "abstractive_explanation", "extractive_explanation": "extractive_explanation"}, "dataset_description": "\nCommon Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.\n", "dataset_name": "cos_e"}, "v1.11": {"config_name": "v1.11", "sample_row": "{\"id\": \"\\\"6b819727eb8a670df26a7ffad036c119\\\"\", \"question\": \"\\\"\\\\\\\"There are 10 apples on an apple tree. Three fa...\", \"choices\": \"[\\\"park\\\", \\\"coloring book\\\", \\\"garden center\\\", \\\"math p...\", \"answer\": \"\\\"math problem\\\"\", \"abstractive_explanation\": \"\\\"webmath is designed to help you solve\\\"\", \"extractive_explanation\": \"\\\"\\\\\\\"there are 10 apples on an apple tree. three fal...\"}", "columns": ["id", "question", "choices", "answer", "abstractive_explanation", "extractive_explanation"], "columns_mapping": {"id": "id", "question": "question", "choices": "choices", "answer": "answer", "abstractive_explanation": "abstractive_explanation", "extractive_explanation": "extractive_explanation"}, "dataset_description": "\nCommon Sense Explanations (CoS-E) allows for training language models to\nautomatically generate explanations that can be used during training and\ninference in a novel Commonsense Auto-Generated Explanation (CAGE) framework.\n", "dataset_name": "cos_e"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|commonsense_qa", "language:en"], "is_gated": false}, "cosmos_qa": {"dataset_name": "cosmos_qa", "description": "Cosmos QA is a large-scale dataset of 35.6K problems that require commonsense-based reading comprehension, formulated as multiple-choice questions. It focuses on reading between the lines over a diverse collection of people's everyday narratives, asking questions concerning on the likely causes or effects of events that require reasoning beyond the exact text spans in the context", "downloads": 18607, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"3Q9SPIIRWJKVQ8244310E8TUS6YWAC##34V1S5K3GTZMDUBNB...\", \"context\": \"\\\"Good Old War and person L : I saw both of these b...\", \"question\": \"\\\"In the future , will this person go to see other ...\", \"answer0\": \"\\\"None of the above choices .\\\"\", \"answer1\": \"\\\"This person likes music and likes to see the show...\", \"answer2\": \"\\\"This person only likes Good Old War and Person L ...\", \"answer3\": \"\\\"Other Bands is not on tour and this person can no...\", \"label\": \"1\"}", "columns": ["id", "context", "question", "answer0", "answer1", "answer2", "answer3", "label"], "columns_mapping": {"id": "id", "context": "context", "question": "question", "answer0": "answer0", "answer1": "answer1", "answer2": "answer2", "answer3": "answer3", "label": "label"}, "dataset_description": "Cosmos QA is a large-scale dataset of 35.6K problems that require commonsense-based reading comprehension, formulated as multiple-choice questions. It focuses on reading between the lines over a diverse collection of people's everyday narratives, asking questions concerning on the likely causes or effects of events that require reasoning beyond the exact text spans in the context\n", "dataset_name": "cosmos_qa"}}, "tags": ["task_categories:multiple-choice", "task_ids:multiple-choice-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "counter": {"dataset_name": "counter", "description": " The COrpus of Urdu News TExt Reuse (COUNTER) corpus contains 1200 documents with real examples of text reuse from the field of journalism. It has been manually annotated at document level with three levels of reuse: wholly derived, partially derived and non derived.", "downloads": 293, "configs": {"default": {"config_name": "default", "sample_row": "{\"source.filename\": \"\\\"0001.xml\\\"\", \"source.headline\": \"\\\"\\\\u0628\\\\u0646\\\\u06af\\\\u0627\\\\u0644 \\\\u0679\\\\u0627\\\\u0626...\", \"source.body\": \"\\\"\\\\u0688\\\\u06be\\\\u0627\\\\u06a9\\\\u06c1 \\\\u06d4 \\\\u06cc\\\\u06a...\", \"source.total_number_of_words\": \"352\", \"source.total_number_of_sentences\": \"15\", \"source.number_of_words_with_swr\": \"245\", \"source.newspaper\": \"\\\"APP\\\"\", \"source.newsdate\": \"\\\"01.12.14\\\"\", \"source.domain\": \"1\", \"source.classification\": \"1\", \"derived.filename\": \"\\\"0001p.xml\\\"\", \"derived.headline\": \"\\\"\\\\u0628\\\\u0646\\\\u06af\\\\u0644\\\\u06c1 \\\\u062f\\\\u06cc\\\\u0634...\", \"derived.body\": \"\\\"\\\\u0645\\\\u06cc\\\\u0631 \\\\u067e\\\\u0648\\\\u0631(\\\\u0648\\\\u064...\", \"derived.total_number_of_words\": \"393\", \"derived.total_number_of_sentences\": \"13\", \"derived.number_of_words_with_swr\": \"265\", \"derived.newspaper\": \"\\\"daily_waqt\\\"\", \"derived.newsdate\": \"\\\"02.12.14\\\"\", \"derived.domain\": \"1\", \"derived.classification\": \"1\"}", "columns": ["source_filename", "source_headline", "source_body", "source_total_number_of_words", "source_total_number_of_sentences", "source_number_of_words_with_swr", "source_newspaper", "source_newsdate", "source_domain", "source_classification", "derived_filename", "derived_headline", "derived_body", "derived_total_number_of_words", "derived_total_number_of_sentences", "derived_number_of_words_with_swr", "derived_newspaper", "derived_newsdate", "derived_domain", "derived_classification"], "columns_mapping": {"source.filename": "source_filename", "source.headline": "source_headline", "source.body": "source_body", "source.total_number_of_words": "source_total_number_of_words", "source.total_number_of_sentences": "source_total_number_of_sentences", "source.number_of_words_with_swr": "source_number_of_words_with_swr", "source.newspaper": "source_newspaper", "source.newsdate": "source_newsdate", "source.domain": "source_domain", "source.classification": "source_classification", "derived.filename": "derived_filename", "derived.headline": "derived_headline", "derived.body": "derived_body", "derived.total_number_of_words": "derived_total_number_of_words", "derived.total_number_of_sentences": "derived_total_number_of_sentences", "derived.number_of_words_with_swr": "derived_number_of_words_with_swr", "derived.newspaper": "derived_newspaper", "derived.newsdate": "derived_newsdate", "derived.domain": "derived_domain", "derived.classification": "derived_classification"}, "dataset_description": " The COrpus of Urdu News TExt Reuse (COUNTER) corpus contains 1200 documents with real examples of text reuse from the field of journalism. It has been manually annotated at document level with three levels of reuse: wholly derived, partially derived and non derived.\n", "dataset_name": "counter"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "task_ids:semantic-similarity-scoring", "task_ids:topic-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:ur"], "is_gated": false}, "covid_qa_castorini": {"dataset_name": "covid_qa_castorini", "description": "CovidQA is the beginnings of a question answering dataset specifically designed for COVID-19, built by hand from knowledge gathered from Kaggle's COVID-19 Open Research Dataset Challenge.", "downloads": 359, "configs": {"covid_qa_castorini": {"config_name": "covid_qa_castorini", "sample_row": "{\"category_name\": \"\\\"Incubation period\\\"\", \"question_query\": \"\\\"What is the incubation period of the virus?\\\"\", \"keyword_query\": \"\\\"Incubation period of the virus\\\"\", \"answers.id\": \"[\\\"wuclekt6\\\", \\\"e3t1f0rt\\\", \\\"ragcpbl6\\\", \\\"n0uwy77g\\\", \\\"...\", \"answers.title\": \"[\\\"Longitudinal analysis of laboratory findings dur...\", \"answers.exact_answer\": \"[\\\"4 days (IQR, 2-7)\\\", \\\"5.84 (99% CI: 4.83, 6.85) d...\"}", "columns": ["category_name", "question_query", "keyword_query", "answers_id", "answers_title", "answers_exact_answer"], "columns_mapping": {"category_name": "category_name", "question_query": "question_query", "keyword_query": "keyword_query", "answers.id": "answers_id", "answers.title": "answers_title", "answers.exact_answer": "answers_exact_answer"}, "dataset_description": "CovidQA is the beginnings of a question answering dataset specifically designed for COVID-19, built by hand from knowledge gathered from Kaggle's COVID-19 Open Research Dataset Challenge.\n", "dataset_name": "covid_qa_castorini"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "crawl_domain": {"dataset_name": "crawl_domain", "description": "Corpus of domain names scraped from Common Crawl and manually annotated to add word boundaries (e.g. \"commoncrawl\" to \"common crawl\"). Breaking domain names such as \"openresearch\" into component words \"open\" and \"research\" is important for applications such as Text-to-Speech synthesis and web search. Common Crawl is an open repository of web crawl data that can be accessed and analyzed by anyone. Specifically, we scraped the plaintext (WET) extracts for domain names from URLs that contained diverse letter casing (e.g. \"OpenBSD\"). Although in the previous example, segmentation is trivial using letter casing, this was not always the case (e.g. \"NASA\"), so we had to manually annotate the data. The dataset is stored as plaintext file where each line is an example of space separated segments of a domain name. The examples are stored in their original letter casing, but harder and more interesting examples can be generated by lowercasing the input first.", "downloads": 321, "configs": {"default": {"config_name": "default", "sample_row": "{\"example\": \"\\\"Insign is Interactive\\\"\"}", "columns": ["example"], "columns_mapping": {"example": "example"}, "dataset_description": "Corpus of domain names scraped from Common Crawl and manually annotated to add word boundaries (e.g. \"commoncrawl\" to \"common crawl\"). Breaking domain names such as \"openresearch\" into component words \"open\" and \"research\" is important for applications such as Text-to-Speech synthesis and web search. Common Crawl is an open repository of web crawl data that can be accessed and analyzed by anyone. Specifically, we scraped the plaintext (WET) extracts for domain names from URLs that contained diverse letter casing (e.g. \"OpenBSD\"). Although in the previous example, segmentation is trivial using letter casing, this was not always the case (e.g. \"NASA\"), so we had to manually annotate the data. The dataset is stored as plaintext file where each line is an example of space separated segments of a domain name. The examples are stored in their original letter casing, but harder and more interesting examples can be generated by lowercasing the input first.", "dataset_name": "crawl_domain"}}, "tags": ["task_categories:other", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|other-Common-Crawl", "source_datasets:original", "language:en", "web-search", "text-to-speech"], "is_gated": false}, "crd3": {"dataset_name": "crd3", "description": "Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.\nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding\nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player\ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,\nand semantic ties to the previous dialogues.", "downloads": 313, "configs": {"default": {"config_name": "default", "sample_row": "{\"chunk\": \"\\\"Matthew Mercer introduces himself and the concept...\", \"chunk_id\": \"0\", \"turn_start\": \"0\", \"turn_end\": \"0\", \"alignment_score\": \"0.0\", \"turns\": \"[{\\\"names\\\": [\\\"MATT\\\"], \\\"utterances\\\": [\\\"Hello everyon...\"}", "columns": ["chunk", "chunk_id", "turn_start", "turn_end", "alignment_score", "turns"], "columns_mapping": {"chunk": "chunk", "chunk_id": "chunk_id", "turn_start": "turn_start", "turn_end": "turn_end", "alignment_score": "alignment_score", "turns": "turns"}, "dataset_description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.\nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding\nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player\ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,\nand semantic ties to the previous dialogues.\n", "dataset_name": "crd3"}}, "tags": ["task_categories:summarization", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "cs_restaurants": {"dataset_name": "cs_restaurants", "description": "This is a dataset for NLG in task-oriented spoken dialogue systems with Czech as the target language. It originated as\na translation of the English San Francisco Restaurants dataset by Wen et al. (2015).", "downloads": 721, "configs": {"CSRestaurants": {"config_name": "CSRestaurants", "sample_row": "{\"da\": \"\\\"inform(food=Indian,good_for_meal='lunch or dinner...\", \"delex_da\": \"\\\"inform(food=X-food,good_for_meal=X-good_for_meal,...\", \"text\": \"\\\"Ko\\\\u010d\\\\u00e1r z V\\\\u00eddn\\\\u011b pod\\\\u00e1v\\\\u00e...\", \"delex_text\": \"\\\"X-name pod\\\\u00e1v\\\\u00e1 X-food pokrmy a d\\\\u00e1 s...\"}", "columns": ["da", "delex_da", "text", "delex_text"], "columns_mapping": {"da": "da", "delex_da": "delex_da", "text": "text", "delex_text": "delex_text"}, "dataset_description": "This is a dataset for NLG in task-oriented spoken dialogue systems with Czech as the target language. It originated as\na translation of the English San Francisco Restaurants dataset by Wen et al. (2015).\n", "dataset_name": "cs_restaurants"}}, "tags": ["task_categories:text2text-generation", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "task_ids:language-modeling", "task_ids:masked-language-modeling", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:extended|other-san-francisco-restaurants", "language:cs", "intent-to-text"], "is_gated": false}, "cuad": {"dataset_name": "cuad", "description": "Contract Understanding Atticus Dataset (CUAD) v1 is a corpus of more than 13,000 labels in 510\ncommercial legal contracts that have been manually labeled to identify 41 categories of important\nclauses that lawyers look for when reviewing contracts in connection with corporate transactions.", "downloads": 1065, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEME...\", \"title\": \"\\\"LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEME...\", \"context\": \"\\\"EXHIBIT 10.6\\\\n\\\\n DIS...\", \"question\": \"\\\"Highlight the parts (if any) of this contract rel...\", \"answers.text\": \"[\\\"DISTRIBUTOR AGREEMENT\\\"]\", \"answers.answer_start\": \"[44]\"}", "columns": ["id", "title", "context", "question", "answers_text", "answers_answer_start"], "columns_mapping": {"id": "id", "title": "title", "context": "context", "question": "question", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "Contract Understanding Atticus Dataset (CUAD) v1 is a corpus of more than 13,000 labels in 510\ncommercial legal contracts that have been manually labeled to identify 41 categories of important\nclauses that lawyers look for when reviewing contracts in connection with corporate transactions.\n", "dataset_name": "cuad"}}, "tags": ["task_categories:question-answering", "task_ids:closed-domain-qa", "task_ids:extractive-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "curiosity_dialogs": {"dataset_name": "curiosity_dialogs", "description": "This dataset contains 14K dialogs (181K utterances) where users and assistants converse about geographic topics like\ngeopolitical entities and locations. This dataset is annotated with pre-existing user knowledge, message-level dialog\nacts, grounding to Wikipedia, and user reactions to messages.", "downloads": 312, "configs": {"curiosity_dialogs": {"config_name": "curiosity_dialogs", "sample_row": "{\"messages.message\": \"[\\\"Hi. I want information about Namibia.\\\", \\\"Nmbia i...\", \"messages.liked\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\", \"messages.sender\": \"[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]\", \"messages.facts\": \"[{\\\"fid\\\": [], \\\"used\\\": [], \\\"source\\\": []}, {\\\"fid\\\": [7...\", \"messages.message_id\": \"[\\\"617343895\\\", \\\"2842515356\\\", \\\"4240816985\\\", \\\"5207110...\", \"messages.dialog_acts\": \"[[\\\"request_topic\\\"], [\\\"inform_response\\\"], [\\\"request...\", \"known_entities\": \"[\\\"South Africa\\\", \\\"United Kingdom\\\", \\\"Portugal\\\"]\", \"focus_entity\": \"\\\"Namibia\\\"\", \"dialog_id\": \"21922\", \"inferred_steps\": \"1\", \"created_time\": \"1571783665\", \"aspects\": \"[\\\"Media\\\", \\\"Politics and government\\\"]\", \"first_aspect\": \"\\\"Media\\\"\", \"second_aspect\": \"\\\"Politics and government\\\"\", \"shuffle_facts\": \"1\", \"related_entities\": \"[\\\"Western Roman Empire\\\", \\\"United Kingdom\\\", \\\"Portug...\", \"tag\": \"\\\"round_2\\\"\", \"user_id\": \"207\", \"assistant_id\": \"341\", \"is_annotated\": \"0\", \"user_dialog_rating\": \"5\", \"user_other_agent_rating\": \"5\", \"assistant_dialog_rating\": \"5\", \"assistant_other_agent_rating\": \"5\", \"reported\": \"0\", \"annotated\": \"1\"}", "columns": ["messages_message", "messages_liked", "messages_sender", "messages_facts", "messages_message_id", "messages_dialog_acts", "known_entities", "focus_entity", "dialog_id", "inferred_steps", "created_time", "aspects", "first_aspect", "second_aspect", "shuffle_facts", "related_entities", "tag", "user_id", "assistant_id", "is_annotated", "user_dialog_rating", "user_other_agent_rating", "assistant_dialog_rating", "assistant_other_agent_rating", "reported", "annotated"], "columns_mapping": {"messages.message": "messages_message", "messages.liked": "messages_liked", "messages.sender": "messages_sender", "messages.facts": "messages_facts", "messages.message_id": "messages_message_id", "messages.dialog_acts": "messages_dialog_acts", "known_entities": "known_entities", "focus_entity": "focus_entity", "dialog_id": "dialog_id", "inferred_steps": "inferred_steps", "created_time": "created_time", "aspects": "aspects", "first_aspect": "first_aspect", "second_aspect": "second_aspect", "shuffle_facts": "shuffle_facts", "related_entities": "related_entities", "tag": "tag", "user_id": "user_id", "assistant_id": "assistant_id", "is_annotated": "is_annotated", "user_dialog_rating": "user_dialog_rating", "user_other_agent_rating": "user_other_agent_rating", "assistant_dialog_rating": "assistant_dialog_rating", "assistant_other_agent_rating": "assistant_other_agent_rating", "reported": "reported", "annotated": "annotated"}, "dataset_description": "This dataset contains 14K dialogs (181K utterances) where users and assistants converse about geographic topics like\ngeopolitical entities and locations. This dataset is annotated with pre-existing user knowledge, message-level dialog\nacts, grounding to Wikipedia, and user reactions to messages.\n", "dataset_name": "curiosity_dialogs"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "conversational-curiosity"], "is_gated": false}, "dane": {"dataset_name": "dane", "description": "The DaNE dataset has been annotated with Named Entities for PER, ORG and LOC\nby the Alexandra Institute.\nIt is a reannotation of the UD-DDT (Universal Dependency - Danish Dependency Treebank)\nwhich has annotations for dependency parsing and part-of-speech (POS) tagging.\nThe Danish UD treebank (Johannsen et al., 2015, UD-DDT) is a conversion of\nthe Danish Dependency Treebank (Buch-Kromann et al. 2003) based on texts\nfrom Parole (Britt, 1998).", "downloads": 857, "configs": {"default": {"config_name": "default", "sample_row": "{\"sent_id\": \"\\\"train-v2-0\\\\n\\\"\", \"text\": \"\\\"P\\\\u00e5 fredag har SID inviteret til reception i ...\", \"tok_ids\": \"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...\", \"tokens\": \"[\\\"P\\\\u00e5\\\", \\\"fredag\\\", \\\"har\\\", \\\"SID\\\", \\\"inviteret\\\", \\\"...\", \"lemmas\": \"[\\\"p\\\\u00e5\\\", \\\"fredag\\\", \\\"have\\\", \\\"SiD\\\", \\\"invitere\\\", \\\"...\", \"pos_tags\": \"[11, 12, 5, 7, 3, 11, 12, 11, 12, 11, 12, 11, 16, ...\", \"morph_tags\": \"[\\\"AdpType=Prep\\\", \\\"Definite=Ind|Gender=Com|Number=S...\", \"dep_ids\": \"[2, 5, 5, 5, 0, 7, 5, 9, 7, 11, 7, 17, 17, 17, 14,...\", \"dep_labels\": \"[35, 16, 28, 33, 19, 35, 16, 35, 18, 35, 18, 1, 1,...\", \"ner_tags\": \"[0, 0, 0, 3, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 1, 2, 0...\"}", "columns": ["sent_id", "text", "tok_ids", "tokens", "lemmas", "pos_tags", "morph_tags", "dep_ids", "dep_labels", "ner_tags"], "columns_mapping": {"sent_id": "sent_id", "text": "text", "tok_ids": "tok_ids", "tokens": "tokens", "lemmas": "lemmas", "pos_tags": "pos_tags", "morph_tags": "morph_tags", "dep_ids": "dep_ids", "dep_labels": "dep_labels", "ner_tags": "ner_tags"}, "dataset_description": "The DaNE dataset has been annotated with Named Entities for PER, ORG and LOC\nby the Alexandra Institute.\nIt is a reannotation of the UD-DDT (Universal Dependency - Danish Dependency Treebank)\nwhich has annotations for dependency parsing and part-of-speech (POS) tagging.\nThe Danish UD treebank (Johannsen et al., 2015, UD-DDT) is a conversion of\nthe Danish Dependency Treebank (Buch-Kromann et al. 2003) based on texts\nfrom Parole (Britt, 1998).\n", "dataset_name": "dane"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "task_ids:part-of-speech", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|other-Danish-Universal-Dependencies-treebank", "language:da"], "is_gated": false}, "danish_political_comments": {"dataset_name": "danish_political_comments", "description": "The dataset consists of 9008 sentences that are labelled with fine-grained polarity in the range from -2 to 2 (negative to postive). The quality of the fine-grained is not cross validated and is therefore subject to uncertainties; however, the simple polarity has been cross validated and therefore is considered to be more correct.", "downloads": 362, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"sentence\": \"\\\"Synes i ikke det er synd for hende ja undskyld mi...\", \"target\": \"3\"}", "columns": ["id", "sentence", "target"], "columns_mapping": {"id": "id", "sentence": "sentence", "target": "target"}, "dataset_description": "The dataset consists of 9008 sentences that are labelled with fine-grained polarity in the range from -2 to 2 (negative to postive). The quality of the fine-grained is not cross validated and is therefore subject to uncertainties; however, the simple polarity has been cross validated and therefore is considered to be more correct.\n", "dataset_name": "danish_political_comments"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:da"], "is_gated": false}, "dart": {"dataset_name": "dart", "description": "DART is a large and open-domain structured DAta Record to Text generation corpus with high-quality\nsentence annotations with each input being a set of entity-relation triples following a tree-structured ontology.\nIt consists of 82191 examples across different domains with each input being a semantic RDF triple set derived\nfrom data records in tables and the tree ontology of table schema, annotated with sentence description that\ncovers all facts in the triple set.\n\nDART is released in the following paper where you can find more details and baseline results:\nhttps://arxiv.org/abs/2007.02871", "downloads": 1029, "configs": {"default": {"config_name": "default", "sample_row": "{\"tripleset\": \"[[\\\"First Clearing\\\", \\\"LOCATION\\\", \\\"On NYS 52 1 Mi. Y...\", \"subtree_was_extended\": \"false\", \"annotations.source\": \"[\\\"WikiTableQuestions_mturk\\\"]\", \"annotations.text\": \"[\\\"First Clearing\\\\tbased on Callicoon, New York and...\"}", "columns": ["tripleset", "subtree_was_extended", "annotations_source", "annotations_text"], "columns_mapping": {"tripleset": "tripleset", "subtree_was_extended": "subtree_was_extended", "annotations.source": "annotations_source", "annotations.text": "annotations_text"}, "dataset_description": "DART is a large and open-domain structured DAta Record to Text generation corpus with high-quality\nsentence annotations with each input being a set of entity-relation triples following a tree-structured ontology.\nIt consists of 82191 examples across different domains with each input being a semantic RDF triple set derived\nfrom data records in tables and the tree ontology of table schema, annotated with sentence description that\ncovers all facts in the triple set.\n\nDART is released in the following paper where you can find more details and baseline results:\nhttps://arxiv.org/abs/2007.02871\n", "dataset_name": "dart"}}, "tags": ["task_categories:tabular-to-text", "task_ids:rdf-to-text", "annotations_creators:crowdsourced", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:extended|wikitable_questions", "source_datasets:extended|wikisql", "source_datasets:extended|web_nlg", "source_datasets:extended|cleaned_e2e", "language:en"], "is_gated": false}, "dbpedia_14": {"dataset_name": "dbpedia_14", "description": "The DBpedia ontology classification dataset is constructed by picking 14 non-overlapping classes\nfrom DBpedia 2014. They are listed in classes.txt. From each of thse 14 ontology classes, we\nrandomly choose 40,000 training samples and 5,000 testing samples. Therefore, the total size\nof the training dataset is 560,000 and testing dataset 70,000.\nThere are 3 columns in the dataset (same for train and test splits), corresponding to class index\n(1 to 14), title and content. The title and content are escaped using double quotes (\"), and any\ninternal double quote is escaped by 2 double quotes (\"\"). There are no new lines in title or content.", "downloads": 6357, "configs": {"dbpedia_14": {"config_name": "dbpedia_14", "sample_row": "{\"label\": \"0\", \"title\": \"\\\"E. D. Abbott Ltd\\\"\", \"content\": \"\\\" Abbott of Farnham E D Abbott Limited was a Briti...\"}", "columns": ["label", "title", "content"], "columns_mapping": {"label": "label", "title": "title", "content": "content"}, "dataset_description": "The DBpedia ontology classification dataset is constructed by picking 14 non-overlapping classes\nfrom DBpedia 2014. They are listed in classes.txt. From each of thse 14 ontology classes, we\nrandomly choose 40,000 training samples and 5,000 testing samples. Therefore, the total size\nof the training dataset is 560,000 and testing dataset 70,000.\nThere are 3 columns in the dataset (same for train and test splits), corresponding to class index\n(1 to 14), title and content. The title and content are escaped using double quotes (\"), and any\ninternal double quote is escaped by 2 double quotes (\"\"). There are no new lines in title or content.\n", "dataset_name": "dbpedia_14"}}, "tags": ["task_categories:text-classification", "task_ids:topic-classification", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "dbrd": {"dataset_name": "dbrd", "description": "The Dutch Book Review Dataset (DBRD) contains over 110k book reviews of which 22k have associated binary sentiment polarity labels. It is intended as a benchmark for sentiment classification in Dutch and created due to a lack of annotated datasets in Dutch that are suitable for this task.", "downloads": 1256, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"text\": \"\\\"Na alle voorgaande boeken van Dan Brown gelezen t...\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The Dutch Book Review Dataset (DBRD) contains over 110k book reviews of which 22k have associated binary sentiment polarity labels. It is intended as a benchmark for sentiment classification in Dutch and created due to a lack of annotated datasets in Dutch that are suitable for this task.\n", "dataset_name": "dbrd"}}, "tags": ["task_categories:text-generation", "task_categories:fill-mask", "task_categories:text-classification", "task_ids:language-modeling", "task_ids:masked-language-modeling", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:nl"], "is_gated": false}, "deal_or_no_dialog": {"dataset_name": "deal_or_no_dialog", "description": "A large dataset of human-human negotiations on a multi-issue bargaining task, where agents who cannot observe each other\u2019s reward functions must reach anagreement (o a deal) via natural language dialogue.", "downloads": 875, "configs": {"dialogues": {"config_name": "dialogues", "sample_row": "{\"input.count\": \"[1, 4, 1]\", \"input.value\": \"[4, 1, 2]\", \"dialogue\": \"\\\"THEM: i would like 4 hats and you can have the re...\", \"output\": \"\\\"item0=1 item1=0 item2=1 item0=0 item1=4 item2=0\\\"...\", \"partner_input.count\": \"[1, 4, 1]\", \"partner_input.value\": \"[0, 2, 2]\"}", "columns": ["input_count", "input_value", "dialogue", "output", "partner_input_count", "partner_input_value"], "columns_mapping": {"input.count": "input_count", "input.value": "input_value", "dialogue": "dialogue", "output": "output", "partner_input.count": "partner_input_count", "partner_input.value": "partner_input_value"}, "dataset_description": "A large dataset of human-human negotiations on a multi-issue bargaining task, where agents who cannot observe each other\u2019s reward functions must reach anagreement (o a deal) via natural language dialogue.\n", "dataset_name": "deal_or_no_dialog"}, "self_play": {"config_name": "self_play", "sample_row": "{\"input.count\": \"[1, 1, 3]\", \"input.value\": \"[0, 1, 3]\"}", "columns": ["input_count", "input_value"], "columns_mapping": {"input.count": "input_count", "input.value": "input_value"}, "dataset_description": "A large dataset of human-human negotiations on a multi-issue bargaining task, where agents who cannot observe each other\u2019s reward functions must reach anagreement (o a deal) via natural language dialogue.\n", "dataset_name": "deal_or_no_dialog"}}, "tags": ["task_categories:conversational", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "definite_pronoun_resolution": {"dataset_name": "definite_pronoun_resolution", "description": "Composed by 30 students from one of the author's undergraduate classes. These\nsentence pairs cover topics ranging from real events (e.g., Iran's plan to\nattack the Saudi ambassador to the U.S.) to events/characters in movies (e.g.,\nBatman) and purely imaginary situations, largely reflecting the pop culture as\nperceived by the American kids born in the early 90s. Each annotated example\nspans four lines: the first line contains the sentence, the second line contains\nthe target pronoun, the third line contains the two candidate antecedents, and\nthe fourth line contains the correct antecedent. If the target pronoun appears\nmore than once in the sentence, its first occurrence is the one to be resolved.", "downloads": 338, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"sentence\": \"\\\"The bee landed on the flower because it had polle...\", \"pronoun\": \"\\\"it\\\"\", \"candidates\": \"[\\\"The bee\\\", \\\"the flower\\\"]\", \"label\": \"1\"}", "columns": ["sentence", "pronoun", "candidates", "label"], "columns_mapping": {"sentence": "sentence", "pronoun": "pronoun", "candidates": "candidates", "label": "label"}, "dataset_description": "Composed by 30 students from one of the author's undergraduate classes. These\nsentence pairs cover topics ranging from real events (e.g., Iran's plan to\nattack the Saudi ambassador to the U.S.) to events/characters in movies (e.g.,\nBatman) and purely imaginary situations, largely reflecting the pop culture as\nperceived by the American kids born in the early 90s. Each annotated example\nspans four lines: the first line contains the sentence, the second line contains\nthe target pronoun, the third line contains the two candidate antecedents, and\nthe fourth line contains the correct antecedent. If the target pronoun appears\nmore than once in the sentence, its first occurrence is the one to be resolved.\n", "dataset_name": "definite_pronoun_resolution"}}, "tags": ["task_categories:token-classification", "task_ids:word-sense-disambiguation", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "dengue_filipino": {"dataset_name": "dengue_filipino", "description": " Benchmark dataset for low-resource multiclass classification, with 4,015 training, 500 testing, and 500 validation examples, each labeled as part of five classes. Each sample can be a part of multiple classes. Collected as tweets.", "downloads": 307, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"Not a good time to get sick.\\\"\", \"absent\": \"0\", \"dengue\": \"0\", \"health\": \"1\", \"mosquito\": \"0\", \"sick\": \"1\"}", "columns": ["text", "absent", "dengue", "health", "mosquito", "sick"], "columns_mapping": {"text": "text", "absent": "absent", "dengue": "dengue", "health": "health", "mosquito": "mosquito", "sick": "sick"}, "dataset_description": " Benchmark dataset for low-resource multiclass classification, with 4,015 training, 500 testing, and 500 validation examples, each labeled as part of five classes. Each sample can be a part of multiple classes. Collected as tweets.\n", "dataset_name": "dengue_filipino"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:crowdsourced", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:tl"], "is_gated": false}, "dialog_re": {"dataset_name": "dialog_re", "description": "DialogRE is the first human-annotated dialogue based relation extraction (RE) dataset aiming\nto support the prediction of relation(s) between two arguments that appear in a dialogue.\nThe dataset annotates all occurrences of 36 possible relation types that exist between pairs\nof arguments in the 1,788 dialogues originating from the complete transcripts of Friends.", "downloads": 293, "configs": {"dialog_re": {"config_name": "dialog_re", "sample_row": "{\"dialog\": \"[\\\"Speaker 1: It's been an hour and not one of my c...\", \"relation_data.x\": \"[\\\"Speaker 2\\\", \\\"Speaker 2\\\", \\\"Speaker 4\\\", \\\"Speaker 4...\", \"relation_data.y\": \"[\\\"Chandler Bing\\\", \\\"Speaker 4\\\", \\\"Tom Gordon\\\", \\\"Spea...\", \"relation_data.x_type\": \"[\\\"PER\\\", \\\"PER\\\", \\\"PER\\\", \\\"PER\\\", \\\"PER\\\", \\\"PER\\\"]\", \"relation_data.y_type\": \"[\\\"PER\\\", \\\"PER\\\", \\\"PER\\\", \\\"PER\\\", \\\"PER\\\", \\\"PER\\\"]\", \"relation_data.r\": \"[[\\\"per:alternate_names\\\"], [\\\"per:alumni\\\"], [\\\"per:al...\", \"relation_data.rid\": \"[[30], [4], [30], [4, 1], [30], [37]]\", \"relation_data.t\": \"[[\\\"\\\"], [\\\"\\\"], [\\\"\\\"], [\\\"\\\", \\\"call me\\\"], [\\\"\\\"], [\\\"\\\"]]\"}", "columns": ["dialog", "relation_data_x", "relation_data_y", "relation_data_x_type", "relation_data_y_type", "relation_data_r", "relation_data_rid", "relation_data_t"], "columns_mapping": {"dialog": "dialog", "relation_data.x": "relation_data_x", "relation_data.y": "relation_data_y", "relation_data.x_type": "relation_data_x_type", "relation_data.y_type": "relation_data_y_type", "relation_data.r": "relation_data_r", "relation_data.rid": "relation_data_rid", "relation_data.t": "relation_data_t"}, "dataset_description": "DialogRE is the first human-annotated dialogue based relation extraction (RE) dataset aiming\nto support the prediction of relation(s) between two arguments that appear in a dialogue.\nThe dataset annotates all occurrences of 36 possible relation types that exist between pairs\nof arguments in the 1,788 dialogues originating from the complete transcripts of Friends.\n", "dataset_name": "dialog_re"}}, "tags": ["task_categories:other", "task_categories:text-generation", "task_categories:fill-mask", "task_ids:dialogue-modeling", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "relation-extraction"], "is_gated": false}, "disaster_response_messages": {"dataset_name": "disaster_response_messages", "description": "This dataset contains 30,000 messages drawn from events including an earthquake in Haiti in 2010, an earthquake in Chile in 2010, floods in Pakistan in 2010, super-storm Sandy in the U.S.A. in 2012, and news articles spanning a large number of years and 100s of different disasters.\nThe data has been encoded with 36 different categories related to disaster response and has been stripped of messages with sensitive information in their entirety.\nUpon release, this is the featured dataset of a new Udacity course on Data Science and the AI4ALL summer school and is especially utile for text analytics and natural language processing (NLP) tasks and models.\nThe input data in this job contains thousands of untranslated disaster-related messages and their English translations.", "downloads": 356, "configs": {"default": {"config_name": "default", "sample_row": "{\"split\": \"\\\"train\\\"\", \"message\": \"\\\"Weather update - a cold front from Cuba that coul...\", \"original\": \"\\\"Un front froid se retrouve sur Cuba ce matin. Il ...\", \"genre\": \"\\\"direct\\\"\", \"related\": \"1\", \"PII\": \"0\", \"request\": \"0\", \"offer\": \"0\", \"aid_related\": \"0\", \"medical_help\": \"0\", \"medical_products\": \"0\", \"search_and_rescue\": \"0\", \"security\": \"0\", \"military\": \"0\", \"child_alone\": \"0\", \"water\": \"0\", \"food\": \"0\", \"shelter\": \"0\", \"clothing\": \"0\", \"money\": \"0\", \"missing_people\": \"0\", \"refugees\": \"0\", \"death\": \"0\", \"other_aid\": \"0\", \"infrastructure_related\": \"0\", \"transport\": \"0\", \"buildings\": \"0\", \"electricity\": \"0\", \"tools\": \"0\", \"hospitals\": \"0\", \"shops\": \"0\", \"aid_centers\": \"0\", \"other_infrastructure\": \"0\", \"weather_related\": \"0\", \"floods\": \"0\", \"storm\": \"0\", \"fire\": \"0\", \"earthquake\": \"0\", \"cold\": \"0\", \"other_weather\": \"0\", \"direct_report\": \"0\"}", "columns": ["split", "message", "original", "genre", "related", "PII", "request", "offer", "aid_related", "medical_help", "medical_products", "search_and_rescue", "security", "military", "child_alone", "water", "food", "shelter", "clothing", "money", "missing_people", "refugees", "death", "other_aid", "infrastructure_related", "transport", "buildings", "electricity", "tools", "hospitals", "shops", "aid_centers", "other_infrastructure", "weather_related", "floods", "storm", "fire", "earthquake", "cold", "other_weather", "direct_report"], "columns_mapping": {"split": "split", "message": "message", "original": "original", "genre": "genre", "related": "related", "PII": "PII", "request": "request", "offer": "offer", "aid_related": "aid_related", "medical_help": "medical_help", "medical_products": "medical_products", "search_and_rescue": "search_and_rescue", "security": "security", "military": "military", "child_alone": "child_alone", "water": "water", "food": "food", "shelter": "shelter", "clothing": "clothing", "money": "money", "missing_people": "missing_people", "refugees": "refugees", "death": "death", "other_aid": "other_aid", "infrastructure_related": "infrastructure_related", "transport": "transport", "buildings": "buildings", "electricity": "electricity", "tools": "tools", "hospitals": "hospitals", "shops": "shops", "aid_centers": "aid_centers", "other_infrastructure": "other_infrastructure", "weather_related": "weather_related", "floods": "floods", "storm": "storm", "fire": "fire", "earthquake": "earthquake", "cold": "cold", "other_weather": "other_weather", "direct_report": "direct_report"}, "dataset_description": "This dataset contains 30,000 messages drawn from events including an earthquake in Haiti in 2010, an earthquake in Chile in 2010, floods in Pakistan in 2010, super-storm Sandy in the U.S.A. in 2012, and news articles spanning a large number of years and 100s of different disasters.\nThe data has been encoded with 36 different categories related to disaster response and has been stripped of messages with sensitive information in their entirety.\nUpon release, this is the featured dataset of a new Udacity course on Data Science and the AI4ALL summer school and is especially utile for text analytics and natural language processing (NLP) tasks and models.\nThe input data in this job contains thousands of untranslated disaster-related messages and their English translations.\n", "dataset_name": "disaster_response_messages"}}, "tags": ["task_categories:text2text-generation", "task_categories:text-classification", "task_ids:intent-classification", "task_ids:sentiment-classification", "task_ids:text-simplification", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:es", "language:fr", "language:ht", "language:ur"], "is_gated": false}, "discofuse": {"dataset_name": "discofuse", "description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.", "downloads": 546, "configs": {"discofuse-sport": {"config_name": "discofuse-sport", "sample_row": "{\"connective_string\": \"\\\"\\\"\", \"discourse_type\": \"\\\"PAIR_ANAPHORA\\\"\", \"coherent_second_sentence\": \"\\\"They have such things as video tapes , coaching s...\", \"has_coref_type_pronoun\": \"1.0\", \"incoherent_first_sentence\": \"\\\"For hockey resouces , please contact the ODCO .\\\"...\", \"incoherent_second_sentence\": \"\\\"ODCO have such things as video tapes , coaching s...\", \"has_coref_type_nominal\": \"0.0\", \"coherent_first_sentence\": \"\\\"For hockey resouces , please contact the ODCO .\\\"...\"}", "columns": ["connective_string", "discourse_type", "coherent_second_sentence", "has_coref_type_pronoun", "incoherent_first_sentence", "incoherent_second_sentence", "has_coref_type_nominal", "coherent_first_sentence"], "columns_mapping": {"connective_string": "connective_string", "discourse_type": "discourse_type", "coherent_second_sentence": "coherent_second_sentence", "has_coref_type_pronoun": "has_coref_type_pronoun", "incoherent_first_sentence": "incoherent_first_sentence", "incoherent_second_sentence": "incoherent_second_sentence", "has_coref_type_nominal": "has_coref_type_nominal", "coherent_first_sentence": "coherent_first_sentence"}, "dataset_description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "dataset_name": "discofuse"}, "discofuse-wikipedia": {"config_name": "discofuse-wikipedia", "sample_row": "{\"connective_string\": \"\\\"\\\"\", \"discourse_type\": \"\\\"PAIR_ANAPHORA\\\"\", \"coherent_second_sentence\": \"\\\"It is located in Nodaway Township .\\\"\", \"has_coref_type_pronoun\": \"1.0\", \"incoherent_first_sentence\": \"\\\"Clarinda is a city in and the county seat of Page...\", \"incoherent_second_sentence\": \"\\\"Clarinda is located in Nodaway Township .\\\"\", \"has_coref_type_nominal\": \"0.0\", \"coherent_first_sentence\": \"\\\"Clarinda is a city in and the county seat of Page...\"}", "columns": ["connective_string", "discourse_type", "coherent_second_sentence", "has_coref_type_pronoun", "incoherent_first_sentence", "incoherent_second_sentence", "has_coref_type_nominal", "coherent_first_sentence"], "columns_mapping": {"connective_string": "connective_string", "discourse_type": "discourse_type", "coherent_second_sentence": "coherent_second_sentence", "has_coref_type_pronoun": "has_coref_type_pronoun", "incoherent_first_sentence": "incoherent_first_sentence", "incoherent_second_sentence": "incoherent_second_sentence", "has_coref_type_nominal": "has_coref_type_nominal", "coherent_first_sentence": "coherent_first_sentence"}, "dataset_description": " DISCOFUSE is a large scale dataset for discourse-based sentence fusion.\n", "dataset_name": "discofuse"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "sentence-fusion"], "is_gated": false}, "disfl_qa": {"dataset_name": "disfl_qa", "description": "Disfl-QA is a targeted dataset for contextual disfluencies in an information seeking setting,\nnamely question answering over Wikipedia passages. Disfl-QA builds upon the SQuAD-v2 (Rajpurkar et al., 2018)\ndataset, where each question in the dev set is annotated to add a contextual disfluency using the paragraph as\na source of distractors.\n\nThe final dataset consists of ~12k (disfluent question, answer) pairs. Over 90% of the disfluencies are\ncorrections or restarts, making it a much harder test set for disfluency correction. Disfl-QA aims to fill a\nmajor gap between speech and NLP research community. We hope the dataset can serve as a benchmark dataset for\ntesting robustness of models against disfluent inputs.\n\nOur expriments reveal that the state-of-the-art models are brittle when subjected to disfluent inputs from\nDisfl-QA. Detailed experiments and analyses can be found in our paper.", "downloads": 310, "configs": {"default": {"config_name": "default", "sample_row": "{\"squad_v2_id\": \"\\\"5a5918ff3e1742001a15cf7e\\\"\", \"original question\": \"\\\"What do unstable isotope studies indicate?\\\"\", \"disfluent question\": \"\\\"What do petrologists no what do unstable isotope ...\", \"title\": \"\\\"Geology\\\"\", \"context\": \"\\\"In addition to identifying rocks in the field, pe...\", \"answers.text\": \"[]\", \"answers.answer_start\": \"[]\"}", "columns": ["squad_v2_id", "original question", "disfluent question", "title", "context", "answers_text", "answers_answer_start"], "columns_mapping": {"squad_v2_id": "squad_v2_id", "original question": "original question", "disfluent question": "disfluent question", "title": "title", "context": "context", "answers.text": "answers_text", "answers.answer_start": "answers_answer_start"}, "dataset_description": "Disfl-QA is a targeted dataset for contextual disfluencies in an information seeking setting,\nnamely question answering over Wikipedia passages. Disfl-QA builds upon the SQuAD-v2 (Rajpurkar et al., 2018)\ndataset, where each question in the dev set is annotated to add a contextual disfluency using the paragraph as\na source of distractors.\n\nThe final dataset consists of ~12k (disfluent question, answer) pairs. Over 90% of the disfluencies are\ncorrections or restarts, making it a much harder test set for disfluency correction. Disfl-QA aims to fill a\nmajor gap between speech and NLP research community. We hope the dataset can serve as a benchmark dataset for\ntesting robustness of models against disfluent inputs.\n\nOur expriments reveal that the state-of-the-art models are brittle when subjected to disfluent inputs from\nDisfl-QA. Detailed experiments and analyses can be found in our paper.\n", "dataset_name": "disfl_qa"}}, "tags": ["task_categories:question-answering", "task_ids:extractive-qa", "task_ids:open-domain-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "dream": {"dataset_name": "dream", "description": "DREAM is a multiple-choice Dialogue-based REAding comprehension exaMination dataset. In contrast to existing reading comprehension datasets, DREAM is the first to focus on in-depth multi-turn multi-party dialogue understanding.", "downloads": 7604, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"id\": \"0\", \"dialogue_id\": \"\\\"5-510\\\"\", \"dialogue\": \"[\\\"M: I am considering dropping my dancing class. I...\", \"question\": \"\\\"What does the man suggest the woman do?\\\"\", \"choice\": \"[\\\"Consult her dancing teacher.\\\", \\\"Take a more inte...\", \"answer\": \"\\\"Continue her dancing class.\\\"\"}", "columns": ["id", "dialogue_id", "dialogue", "question", "choice", "answer"], "columns_mapping": {"id": "id", "dialogue_id": "dialogue_id", "dialogue": "dialogue", "question": "question", "choice": "choice", "answer": "answer"}, "dataset_description": "DREAM is a multiple-choice Dialogue-based REAding comprehension exaMination dataset. In contrast to existing reading comprehension datasets, DREAM is the first to focus on in-depth multi-turn multi-party dialogue understanding.\n", "dataset_name": "dream"}}, "tags": ["task_categories:question-answering", "task_ids:multiple-choice-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "drop": {"dataset_name": "drop", "description": "DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs.\n. DROP is a crowdsourced, adversarially-created, 96k-question benchmark, in which a system must resolve references in a\nquestion, perhaps to multiple input positions, and perform discrete operations over them (such as addition, counting, or\n sorting). These operations require a much more comprehensive understanding of the content of paragraphs than what was\n necessary for prior datasets.", "downloads": 2571, "configs": {"default": {"config_name": "default", "sample_row": "{\"section_id\": \"\\\"nfl_2201\\\"\", \"query_id\": \"\\\"f16c0ee7-f131-4a8b-a6ac-4d275ea68066\\\"\", \"passage\": \"\\\"To start the season, the Lions traveled south to ...\", \"question\": \"\\\"How many points did the buccaneers need to tie in...\", \"answers_spans.spans\": \"[\\\"3\\\"]\", \"answers_spans.types\": \"[\\\"number\\\"]\"}", "columns": ["section_id", "query_id", "passage", "question", "answers_spans_spans", "answers_spans_types"], "columns_mapping": {"section_id": "section_id", "query_id": "query_id", "passage": "passage", "question": "question", "answers_spans.spans": "answers_spans_spans", "answers_spans.types": "answers_spans_types"}, "dataset_description": "DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs.\n. DROP is a crowdsourced, adversarially-created, 96k-question benchmark, in which a system must resolve references in a\nquestion, perhaps to multiple input positions, and perform discrete operations over them (such as addition, counting, or\n sorting). These operations require a much more comprehensive understanding of the content of paragraphs than what was\n necessary for prior datasets.\n", "dataset_name": "drop"}}, "tags": ["task_categories:question-answering", "task_categories:text2text-generation", "task_ids:extractive-qa", "task_ids:abstractive-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "dutch_social": {"dataset_name": "dutch_social", "description": "The dataset contains around 271,342 tweets. The tweets are filtered via the official Twitter API to\ncontain tweets in Dutch language or by users who have specified their location information within Netherlands\ngeographical boundaries. Using natural language processing we have classified the tweets for their HISCO codes.\nIf the user has provided their location within Dutch boundaries, we have also classified them to their respective\nprovinces The objective of this dataset is to make research data available publicly in a FAIR (Findable, Accessible,\nInteroperable, Reusable) way. Twitter's Terms of Service Licensed under Attribution-NonCommercial 4.0 International\n(CC BY-NC 4.0) (2020-10-27)", "downloads": 319, "configs": {"dutch_social": {"config_name": "dutch_social", "sample_row": "{\"full_text\": \"\\\"Maar , er iets nuttigs mee doen ? Zie jij 'm vert...\", \"text_translation\": \"\\\"However, there is something useful to do with it?...\", \"screen_name\": \"\\\"RonaldMeeuwis\\\"\", \"description\": \"\\\"None\\\"\", \"desc_translation\": \"\\\"None\\\"\", \"location\": \"\\\"None\\\"\", \"weekofyear\": \"21\", \"weekday\": \"3\", \"month\": \"5\", \"year\": \"2020\", \"day\": \"21\", \"point_info\": \"\\\"\\\"\", \"point\": \"\\\"None\\\"\", \"latitude\": \"0.0\", \"longitude\": \"0.0\", \"altitude\": \"0.0\", \"province\": \"\\\"False\\\"\", \"hisco_standard\": \"\\\"None\\\"\", \"hisco_code\": \"\\\"None\\\"\", \"industry\": \"false\", \"sentiment_pattern\": \"0.0\", \"subjective_pattern\": \"0.0\", \"label\": \"1\"}", "columns": ["full_text", "text_translation", "screen_name", "description", "desc_translation", "location", "weekofyear", "weekday", "month", "year", "day", "point_info", "point", "latitude", "longitude", "altitude", "province", "hisco_standard", "hisco_code", "industry", "sentiment_pattern", "subjective_pattern", "label"], "columns_mapping": {"full_text": "full_text", "text_translation": "text_translation", "screen_name": "screen_name", "description": "description", "desc_translation": "desc_translation", "location": "location", "weekofyear": "weekofyear", "weekday": "weekday", "month": "month", "year": "year", "day": "day", "point_info": "point_info", "point": "point", "latitude": "latitude", "longitude": "longitude", "altitude": "altitude", "province": "province", "hisco_standard": "hisco_standard", "hisco_code": "hisco_code", "industry": "industry", "sentiment_pattern": "sentiment_pattern", "subjective_pattern": "subjective_pattern", "label": "label"}, "dataset_description": "The dataset contains around 271,342 tweets. The tweets are filtered via the official Twitter API to\ncontain tweets in Dutch language or by users who have specified their location information within Netherlands\ngeographical boundaries. Using natural language processing we have classified the tweets for their HISCO codes.\nIf the user has provided their location within Dutch boundaries, we have also classified them to their respective\nprovinces The objective of this dataset is to make research data available publicly in a FAIR (Findable, Accessible,\nInteroperable, Reusable) way. Twitter's Terms of Service Licensed under Attribution-NonCommercial 4.0 International\n(CC BY-NC 4.0) (2020-10-27)\n", "dataset_name": "dutch_social"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "task_ids:multi-label-classification", "annotations_creators:machine-generated", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:nl"], "is_gated": false}, "dyk": {"dataset_name": "dyk", "description": "The Did You Know (pol. Czy wiesz?) dataset consists of human-annotated question-answer pairs. The task is to predict if the answer is correct. We chose the negatives which have the largest token overlap with a question.", "downloads": 382, "configs": {"default": {"config_name": "default", "sample_row": "{\"q_id\": \"\\\"czywiesz4068\\\"\", \"question\": \"\\\"z jakiego powodu zwo\\\\u0142ano synod w Whitby?\\\"\", \"answer\": \"\\\"W\\\\u015br\\\\u00f3d mnich\\\\u00f3w i mniszek mieszkaj\\\\u...\", \"target\": \"0\"}", "columns": ["q_id", "question", "answer", "target"], "columns_mapping": {"q_id": "q_id", "question": "question", "answer": "answer", "target": "target"}, "dataset_description": "The Did You Know (pol. Czy wiesz?) dataset consists of human-annotated question-answer pairs. The task is to predict if the answer is correct. We chose the negatives which have the largest token overlap with a question.\n", "dataset_name": "dyk"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl"], "is_gated": false}, "e2e_nlg": {"dataset_name": "e2e_nlg", "description": "The E2E dataset is used for training end-to-end, data-driven natural language generation systems in the restaurant domain, which is ten times bigger than existing, frequently used datasets in this area.\nThe E2E dataset poses new challenges:\n(1) its human reference texts show more lexical richness and syntactic variation, including discourse phenomena;\n(2) generating from this set requires content selection. As such, learning from this dataset promises more natural, varied and less template-like system utterances.\n\nE2E is released in the following paper where you can find more details and baseline results:\nhttps://arxiv.org/abs/1706.09254", "downloads": 1908, "configs": {"default": {"config_name": "default", "sample_row": "{\"meaning_representation\": \"\\\"name[The Vaults], eatType[pub], priceRange[more t...\", \"human_reference\": \"\\\"The Vaults pub near Caf\\\\u00e9 Adriatic has a 5 st...\"}", "columns": ["meaning_representation", "human_reference"], "columns_mapping": {"meaning_representation": "meaning_representation", "human_reference": "human_reference"}, "dataset_description": "The E2E dataset is used for training end-to-end, data-driven natural language generation systems in the restaurant domain, which is ten times bigger than existing, frequently used datasets in this area.\nThe E2E dataset poses new challenges:\n(1) its human reference texts show more lexical richness and syntactic variation, including discourse phenomena;\n(2) generating from this set requires content selection. As such, learning from this dataset promises more natural, varied and less template-like system utterances.\n\nE2E is released in the following paper where you can find more details and baseline results:\nhttps://arxiv.org/abs/1706.09254\n", "dataset_name": "e2e_nlg"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "meaning-representation-to-text"], "is_gated": false}, "e2e_nlg_cleaned": {"dataset_name": "e2e_nlg_cleaned", "description": "An update release of E2E NLG Challenge data with cleaned MRs and scripts, accompanying the following paper:\n\nOnd\u0159ej Du\u0161ek, David M. Howcroft, and Verena Rieser (2019): Semantic Noise Matters for Neural Natural Language Generation. In INLG, Tokyo, Japan.", "downloads": 830, "configs": {"default": {"config_name": "default", "sample_row": "{\"meaning_representation\": \"\\\"name[The Eagle], eatType[coffee shop], food[Japan...\", \"human_reference\": \"\\\"The Eagle is a low rated coffee shop near Burger ...\"}", "columns": ["meaning_representation", "human_reference"], "columns_mapping": {"meaning_representation": "meaning_representation", "human_reference": "human_reference"}, "dataset_description": "An update release of E2E NLG Challenge data with cleaned MRs and scripts, accompanying the following paper:\n\nOnd\u0159ej Du\u0161ek, David M. Howcroft, and Verena Rieser (2019): Semantic Noise Matters for Neural Natural Language Generation. In INLG, Tokyo, Japan.\n", "dataset_name": "e2e_nlg_cleaned"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "meaning-representation-to-text"], "is_gated": false}, "ecb": {"dataset_name": "ecb", "description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M", "downloads": 991, "configs": {"de-fr": {"config_name": "de-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"Navigation Path : Home > The European Central ...\", \"translation.fr\": \"\\\"Navigation Path : Home > The European Central ...\"}", "columns": ["id", "translation_de", "translation_fr"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.fr": "translation_fr"}, "dataset_description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M\n", "dataset_name": "ecb"}, "cs-en": {"config_name": "cs-en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.cs\": \"\\\"Navigation Path : Home > The European Central ...\", \"translation.en\": \"\\\"Navigation Path : Home > The European Central ...\"}", "columns": ["id", "translation_cs", "translation_en"], "columns_mapping": {"id": "id", "translation.cs": "translation_cs", "translation.en": "translation_en"}, "dataset_description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M\n", "dataset_name": "ecb"}, "el-it": {"config_name": "el-it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.el\": \"\\\"EL\\\"\", \"translation.it\": \"\\\"IT\\\"\"}", "columns": ["id", "translation_el", "translation_it"], "columns_mapping": {"id": "id", "translation.el": "translation_el", "translation.it": "translation_it"}, "dataset_description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M\n", "dataset_name": "ecb"}, "en-nl": {"config_name": "en-nl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"This message is formulated in collaboration with ...\", \"translation.nl\": \"\\\"Bijgaand bericht is opgesteld in overleg met Chri...\"}", "columns": ["id", "translation_en", "translation_nl"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.nl": "translation_nl"}, "dataset_description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M\n", "dataset_name": "ecb"}, "fi-pl": {"config_name": "fi-pl", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fi\": \"\\\"Py\\\\u00f6ristyksist\\\\u00e4 johtuen yhteenlaskut eiv...\", \"translation.pl\": \"\\\"Poszczeg\\\\u00f3lne pozycje mog\\\\u0105 nie sumowa\\\\u0...\"}", "columns": ["id", "translation_fi", "translation_pl"], "columns_mapping": {"id": "id", "translation.fi": "translation_fi", "translation.pl": "translation_pl"}, "dataset_description": "Original source: Website and documentatuion from the European Central Bank, compiled and made available by Alberto Simoes (thank you very much!)\n19 languages, 170 bitexts\ntotal number of files: 340\ntotal number of tokens: 757.37M\ntotal number of sentence fragments: 30.55M\n", "dataset_name": "ecb"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:hu", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:pl", "language:pt", "language:sk", "language:sl"], "is_gated": false}, "ecthr_cases": {"dataset_name": "ecthr_cases", "description": "The ECtHR Cases dataset is designed for experimentation of neural judgment prediction and rationale extraction considering ECtHR cases.", "downloads": 1024, "configs": {"alleged-violation-prediction": {"config_name": "alleged-violation-prediction", "sample_row": "{\"facts\": \"[\\\"11. At the beginning of the events relevant to ...\", \"labels\": \"[\\\"13\\\", \\\"8\\\"]\", \"silver_rationales\": \"[1, 13, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31...\", \"gold_rationales\": \"[]\"}", "columns": ["facts", "labels", "silver_rationales", "gold_rationales"], "columns_mapping": {"facts": "facts", "labels": "labels", "silver_rationales": "silver_rationales", "gold_rationales": "gold_rationales"}, "dataset_description": "The ECtHR Cases dataset is designed for experimentation of neural judgment prediction and rationale extraction considering ECtHR cases.\n", "dataset_name": "ecthr_cases"}, "violation-prediction": {"config_name": "violation-prediction", "sample_row": "{\"facts\": \"[\\\"11. At the beginning of the events relevant to ...\", \"labels\": \"[\\\"8\\\"]\", \"silver_rationales\": \"[1, 13, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31...\"}", "columns": ["facts", "labels", "silver_rationales"], "columns_mapping": {"facts": "facts", "labels": "labels", "silver_rationales": "silver_rationales"}, "dataset_description": "The ECtHR Cases dataset is designed for experimentation of neural judgment prediction and rationale extraction considering ECtHR cases.\n", "dataset_name": "ecthr_cases"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "rationale-extraction", "legal-judgment-prediction"], "is_gated": false}, "ehealth_kd": {"dataset_name": "ehealth_kd", "description": "Dataset of the eHealth Knowledge Discovery Challenge at IberLEF 2020. It is designed for\nthe identification of semantic entities and relations in Spanish health documents.", "downloads": 290, "configs": {"ehealth_kd": {"config_name": "ehealth_kd", "sample_row": "{\"sentence\": \"\\\"En la leucemia linfoc\\\\u00edtica cr\\\\u00f3nica, hay...\", \"entities\": \"[{\\\"ent_id\\\": \\\"T1\\\", \\\"ent_text\\\": \\\"leucemia linfoc\\\\u00...\", \"relations\": \"[{\\\"rel_id\\\": \\\"R0\\\", \\\"rel_label\\\": 0, \\\"arg1\\\": \\\"T2\\\", \\\"a...\"}", "columns": ["sentence", "entities", "relations"], "columns_mapping": {"sentence": "sentence", "entities": "entities", "relations": "relations"}, "dataset_description": "Dataset of the eHealth Knowledge Discovery Challenge at IberLEF 2020. It is designed for\nthe identification of semantic entities and relations in Spanish health documents.\n", "dataset_name": "ehealth_kd"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:es", "relation-prediction"], "is_gated": false}, "eli5_category": {"dataset_name": "eli5_category", "description": "The ELI5-Category dataset is a smaller but newer and categorized version of the original ELI5 dataset. After 2017, a tagging system was introduced to this subreddit so that the questions can be categorized into different topics according to their tags. Since the training and validation set is built by questions in different topics, the dataset is expected to alleviate the train/validation overlapping issue in the original ELI5 dataset.", "downloads": 411, "configs": {"default": {"config_name": "default", "sample_row": "{\"q_id\": \"\\\"5lchat\\\"\", \"title\": \"\\\"Why there was a 'leap second' added to the end of...\", \"selftext\": \"\\\"\\\"\", \"category\": \"\\\"Other\\\"\", \"subreddit\": \"\\\"explainlikeimfive\\\"\", \"answers.a_id\": \"[\\\"dbuoyxl\\\", \\\"dbur7gi\\\", \\\"dbuotht\\\"]\", \"answers.text\": \"[\\\"the rotation of the earth is not a constant. in ...\", \"answers.score\": \"[44, 5, 4]\", \"answers.text_urls\": \"[[], [\\\"http://adminhacks.com/leap-second-bugs.html...\", \"title_urls\": \"[\\\"url\\\"]\", \"selftext_urls\": \"[\\\"url\\\"]\"}", "columns": ["q_id", "title", "selftext", "category", "subreddit", "answers_a_id", "answers_text", "answers_score", "answers_text_urls", "title_urls", "selftext_urls"], "columns_mapping": {"q_id": "q_id", "title": "title", "selftext": "selftext", "category": "category", "subreddit": "subreddit", "answers.a_id": "answers_a_id", "answers.text": "answers_text", "answers.score": "answers_score", "answers.text_urls": "answers_text_urls", "title_urls": "title_urls", "selftext_urls": "selftext_urls"}, "dataset_description": "The ELI5-Category dataset is a smaller but newer and categorized version of the original ELI5 dataset. After 2017, a tagging system was introduced to this subreddit so that the questions can be categorized into different topics according to their tags. Since the training and validation set is built by questions in different topics, the dataset is expected to alleviate the train/validation overlapping issue in the original ELI5 dataset.\n", "dataset_name": "eli5_category"}}, "tags": ["task_categories:text2text-generation", "task_ids:abstractive-qa", "task_ids:open-domain-abstractive-qa", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:extended|eli5", "language:en"], "is_gated": false}, "emea": {"dataset_name": "emea", "description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M", "downloads": 858, "configs": {"bg-el": {"config_name": "bg-el", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.bg\": \"\\\"European Medicines Agency\\\"\", \"translation.el\": \"\\\"European Medicines Agency\\\"\"}", "columns": ["id", "translation_bg", "translation_el"], "columns_mapping": {"id": "id", "translation.bg": "translation_bg", "translation.el": "translation_el"}, "dataset_description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M\n", "dataset_name": "emea"}, "cs-et": {"config_name": "cs-et", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.cs\": \"\\\"European Medicines Agency\\\"\", \"translation.et\": \"\\\"European Medicines Agency\\\"\"}", "columns": ["id", "translation_cs", "translation_et"], "columns_mapping": {"id": "id", "translation.cs": "translation_cs", "translation.et": "translation_et"}, "dataset_description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M\n", "dataset_name": "emea"}, "de-mt": {"config_name": "de-mt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.de\": \"\\\"European Medicines Agency\\\"\", \"translation.mt\": \"\\\"European Medicines Agency\\\"\"}", "columns": ["id", "translation_de", "translation_mt"], "columns_mapping": {"id": "id", "translation.de": "translation_de", "translation.mt": "translation_mt"}, "dataset_description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M\n", "dataset_name": "emea"}, "fr-sk": {"config_name": "fr-sk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.fr\": \"\\\"European Medicines Agency\\\"\", \"translation.sk\": \"\\\"European Medicines Agency\\\"\"}", "columns": ["id", "translation_fr", "translation_sk"], "columns_mapping": {"id": "id", "translation.fr": "translation_fr", "translation.sk": "translation_sk"}, "dataset_description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M\n", "dataset_name": "emea"}, "es-lt": {"config_name": "es-lt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.es\": \"\\\"European Medicines Agency\\\"\", \"translation.lt\": \"\\\"European Medicines Agency\\\"\"}", "columns": ["id", "translation_es", "translation_lt"], "columns_mapping": {"id": "id", "translation.es": "translation_es", "translation.lt": "translation_lt"}, "dataset_description": "This is a parallel corpus made out of PDF documents from the European Medicines Agency. All files are automatically converted from PDF to plain text using pdftotext with the command line arguments -layout -nopgbrk -eol unix. There are some known problems with tables and multi-column layouts - some of them are fixed in the current version.\n\nsource: http://www.emea.europa.eu/\n\n22 languages, 231 bitexts\ntotal number of files: 41,957\ntotal number of tokens: 311.65M\ntotal number of sentence fragments: 26.51M\n", "dataset_name": "emea"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:hu", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:pl", "language:pt", "language:ro", "language:sk", "language:sl", "language:sv"], "is_gated": false}, "emo": {"dataset_name": "emo", "description": "In this dataset, given a textual dialogue i.e. an utterance along with two previous turns of context, the goal was to infer the underlying emotion of the utterance by choosing from four emotion classes - Happy, Sad, Angry and Others.", "downloads": 657, "configs": {"emo2019": {"config_name": "emo2019", "sample_row": "{\"text\": \"\\\"don't worry i'm girl hmm how do i know if you ar...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "In this dataset, given a textual dialogue i.e. an utterance along with two previous turns of context, the goal was to infer the underlying emotion of the utterance by choosing from four emotion classes - Happy, Sad, Angry and Others.\n", "dataset_name": "emo"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "dair-ai/emotion": {"dataset_name": "dair-ai/emotion", "description": "Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.", "downloads": 22353, "configs": {"split": {"config_name": "split", "sample_row": "{\"text\": \"\\\"i didnt feel humiliated\\\"\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.\n", "dataset_name": "dair-ai/emotion"}, "unsplit": {"config_name": "unsplit", "sample_row": "{\"text\": \"\\\"i feel awful about it too because it s my job to ...\", \"label\": \"0\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.\n", "dataset_name": "dair-ai/emotion"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "emotion-classification"], "is_gated": false}, "emotone_ar": {"dataset_name": "emotone_ar", "description": "Dataset of 10065 tweets in Arabic for Emotion detection in Arabic text", "downloads": 399, "configs": {"default": {"config_name": "default", "sample_row": "{\"tweet\": \"\\\"\\\\u0627\\\\u0644\\\\u0627\\\\u0648\\\\u0644\\\\u064a\\\\u0645\\\\u0628\\\\...\", \"label\": \"0\"}", "columns": ["tweet", "label"], "columns_mapping": {"tweet": "tweet", "label": "label"}, "dataset_description": "Dataset of 10065 tweets in Arabic for Emotion detection in Arabic text", "dataset_name": "emotone_ar"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:ar"], "is_gated": false}, "empathetic_dialogues": {"dataset_name": "empathetic_dialogues", "description": "PyTorch original implementation of Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset", "downloads": 1076, "configs": {"default": {"config_name": "default", "sample_row": "{\"conv_id\": \"\\\"hit:0_conv:1\\\"\", \"utterance_idx\": \"1\", \"context\": \"\\\"sentimental\\\"\", \"prompt\": \"\\\"I remember going to the fireworks with my best fr...\", \"speaker_idx\": \"1\", \"utterance\": \"\\\"I remember going to see the fireworks with my bes...\", \"selfeval\": \"\\\"5|5|5_2|2|5\\\"\", \"tags\": \"\\\"\\\"\"}", "columns": ["conv_id", "utterance_idx", "context", "prompt", "speaker_idx", "utterance", "selfeval", "tags"], "columns_mapping": {"conv_id": "conv_id", "utterance_idx": "utterance_idx", "context": "context", "prompt": "prompt", "speaker_idx": "speaker_idx", "utterance": "utterance", "selfeval": "selfeval", "tags": "tags"}, "dataset_description": "PyTorch original implementation of Towards Empathetic Open-domain Conversation Models: a New Benchmark and Dataset\n", "dataset_name": "empathetic_dialogues"}}, "tags": ["task_categories:conversational", "task_categories:question-answering", "task_ids:dialogue-generation", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "eraser_multi_rc": {"dataset_name": "eraser_multi_rc", "description": "Eraser Multi RC is a dataset for queries over multi-line passages, along with\nanswers and a rationalte. Each example in this dataset has the following 5 parts\n1. A Mutli-line Passage\n2. A Query about the passage\n3. An Answer to the query\n4. A Classification as to whether the answer is right or wrong\n5. An Explanation justifying the classification", "downloads": 694, "configs": {"default": {"config_name": "default", "sample_row": "{\"passage\": \"\\\"As his car slid downtown on Tuesday morning the m...\", \"query_and_answer\": \"\\\"How does Mr. Thorndike act upon his impulse ? || ...\", \"label\": \"0\", \"evidences\": \"[\\\"It was these same impulses , leading so invariab...\"}", "columns": ["passage", "query_and_answer", "label", "evidences"], "columns_mapping": {"passage": "passage", "query_and_answer": "query_and_answer", "label": "label", "evidences": "evidences"}, "dataset_description": "\nEraser Multi RC is a dataset for queries over multi-line passages, along with\nanswers and a rationalte. Each example in this dataset has the following 5 parts\n1. A Mutli-line Passage\n2. A Query about the passage\n3. An Answer to the query\n4. A Classification as to whether the answer is right or wrong\n5. An Explanation justifying the classification\n", "dataset_name": "eraser_multi_rc"}}, "tags": ["task_categories:multiple-choice", "task_ids:multiple-choice-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "eth_py150_open": {"dataset_name": "eth_py150_open", "description": "A redistributable subset of the ETH Py150 corpus, introduced in the ICML 2020 paper 'Learning and Evaluating Contextual Embedding of Source Code'", "downloads": 296, "configs": {"eth_py150_open": {"config_name": "eth_py150_open", "sample_row": "{\"filepath\": \"\\\"05bit/django-smarter/example/example/settings.py\\\"...\", \"license\": \"\\\"bsd-3-clause\\\"\"}", "columns": ["filepath", "license"], "columns_mapping": {"filepath": "filepath", "license": "license"}, "dataset_description": "A redistributable subset of the ETH Py150 corpus, introduced in the ICML 2020 paper 'Learning and Evaluating Contextual Embedding of Source Code'\n", "dataset_name": "eth_py150_open"}}, "tags": ["task_categories:other", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:original", "language:en", "contextual-embeddings"], "is_gated": false}, "ethos": {"dataset_name": "ethos", "description": "ETHOS: onlinE haTe speecH detectiOn dataSet. This repository contains a dataset for hate speech\ndetection on social media platforms, called Ethos. There are two variations of the dataset:\n\nEthos_Dataset_Binary: contains 998 comments in the dataset alongside with a label\nabout hate speech presence or absence. 565 of them do not contain hate speech,\nwhile the rest of them, 433, contain.\n\nEthos_Dataset_Multi_Label: which contains 8 labels for the 433 comments with hate speech content.\nThese labels are violence (if it incites (1) or not (0) violence), directed_vs_general (if it is\ndirected to a person (1) or a group (0)), and 6 labels about the category of hate speech like,\ngender, race, national_origin, disability, religion and sexual_orientation.", "downloads": 7060, "configs": {"binary": {"config_name": "binary", "sample_row": "{\"text\": \"\\\"You should know women's sports are a joke\\\"\", \"label\": \"1\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "", "dataset_name": "ethos"}, "multilabel": {"config_name": "multilabel", "sample_row": "{\"text\": \"\\\"You should know women's sports are a joke\\\"\", \"violence\": \"0\", \"directed_vs_generalized\": \"0\", \"gender\": \"1\", \"race\": \"0\", \"national_origin\": \"0\", \"disability\": \"0\", \"religion\": \"0\", \"sexual_orientation\": \"0\"}", "columns": ["text", "violence", "directed_vs_generalized", "gender", "race", "national_origin", "disability", "religion", "sexual_orientation"], "columns_mapping": {"text": "text", "violence": "violence", "directed_vs_generalized": "directed_vs_generalized", "gender": "gender", "race": "race", "national_origin": "national_origin", "disability": "disability", "religion": "religion", "sexual_orientation": "sexual_orientation"}, "dataset_description": "", "dataset_name": "ethos"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "Hate Speech Detection"], "is_gated": false}, "eu_regulatory_ir": {"dataset_name": "eu_regulatory_ir", "description": "EURegIR: Regulatory Compliance IR (EU/UK)", "downloads": 444, "configs": {"eu2uk": {"config_name": "eu2uk", "sample_row": "{\"document_id\": \"\\\"31977L0539\\\"\", \"publication_year\": \"\\\"1977\\\"\", \"text\": \"\\\"Council Directive 77/539/EEC of 28 June 1977 on t...\", \"relevant_documents\": \"[\\\"UKSI19801182\\\"]\"}", "columns": ["document_id", "publication_year", "text", "relevant_documents"], "columns_mapping": {"document_id": "document_id", "publication_year": "publication_year", "text": "text", "relevant_documents": "relevant_documents"}, "dataset_description": "EURegIR: Regulatory Compliance IR (EU/UK)\n", "dataset_name": "eu_regulatory_ir"}, "uk2eu": {"config_name": "uk2eu", "sample_row": "{\"document_id\": \"\\\"UKPGA19700044\\\"\", \"publication_year\": \"\\\"1970\\\"\", \"text\": \"\\\"Chronically Sick and Disabled Persons Act 1970\\\\n\\\\...\", \"relevant_documents\": \"[\\\"32001L0055\\\"]\"}", "columns": ["document_id", "publication_year", "text", "relevant_documents"], "columns_mapping": {"document_id": "document_id", "publication_year": "publication_year", "text": "text", "relevant_documents": "relevant_documents"}, "dataset_description": "EURegIR: Regulatory Compliance IR (EU/UK)\n", "dataset_name": "eu_regulatory_ir"}}, "tags": ["task_categories:text-retrieval", "task_ids:document-retrieval", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "document-to-document-retrieval"], "is_gated": false}, "eurlex": {"dataset_name": "eurlex", "description": "EURLEX57K contains 57k legislative documents in English from EUR-Lex portal, annotated with EUROVOC concepts.", "downloads": 417, "configs": {"eurlex57k": {"config_name": "eurlex57k", "sample_row": "{\"celex_id\": \"\\\"32014R0727\\\"\", \"title\": \"\\\"Commission Implementing Regulation (EU) No 727/20...\", \"text\": \"\\\"1.7.2014 EN Official Journal of the European Unio...\", \"eurovoc_concepts\": \"[\\\"1402\\\", \\\"2771\\\", \\\"3191\\\", \\\"5055\\\", \\\"519\\\", \\\"5969\\\", \\\"5...\"}", "columns": ["celex_id", "title", "text", "eurovoc_concepts"], "columns_mapping": {"celex_id": "celex_id", "title": "title", "text": "text", "eurovoc_concepts": "eurovoc_concepts"}, "dataset_description": "EURLEX57K contains 57k legislative documents in English from EUR-Lex portal, annotated with EUROVOC concepts.\n", "dataset_name": "eurlex"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "annotations_creators:found", "multilinguality:monolingual", "source_datasets:original", "language:en", "legal-topic-classification"], "is_gated": false}, "euronews": {"dataset_name": "euronews", "description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.", "downloads": 965, "configs": {"fr-bnf": {"config_name": "fr-bnf", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Emmanuel\\\", \\\"DESOLES\\\", \\\"de\\\", \\\"LOU\\\", \\\"Directeur\\\", ...\", \"ner_tags\": \"[2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 0, 0, 6...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.\n", "dataset_name": "euronews"}, "nl-kb": {"config_name": "nl-kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Indien\\\", \\\"men\\\", \\\"Itali\\\\u00eb\\\", \\\"in\\\", \\\"zijn\\\", \\\"ge...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.\n", "dataset_name": "euronews"}, "de-sbb": {"config_name": "de-sbb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Donnerstag\\\", \\\",\\\", \\\"1\\\", \\\".\\\", \\\"Januar\\\", \\\".\\\", \\\"Kam/...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.\n", "dataset_name": "euronews"}, "de-onb": {"config_name": "de-onb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"November\\\", \\\"Heute\\\", \\\"/\\\", \\\"als\\\", \\\"am\\\", \\\"Fest\\\", \\\"V...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 1...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.\n", "dataset_name": "euronews"}, "de-lft": {"config_name": "de-lft", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Eintracht\\\", \\\",\\\", \\\"die\\\", \\\"nicht\\\", \\\"nur\\\", \\\"ideal\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpora comprise of files per data provider that are encoded in the IOB format (Ramshaw & Marcus, 1995). The IOB format is a simple text chunking format that divides texts into single tokens per line, and, separated by a whitespace, tags to mark named entities. The most commonly used categories for tags are PER (person), LOC (location) and ORG (organization). To mark named entities that span multiple tokens, the tags have a prefix of either B- (beginning of named entity) or I- (inside of named entity). O (outside of named entity) tags are used to mark tokens that are not a named entity.\n", "dataset_name": "euronews"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:de", "language:fr", "language:nl"], "is_gated": false}, "europa_eac_tm": {"dataset_name": "europa_eac_tm", "description": "In October 2012, the European Union's (EU) Directorate General for Education and Culture ( DG EAC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-six languages. This resource bears the name EAC Translation Memory, short EAC-TM.\n\nEAC-TM covers up to 26 languages: 22 official languages of the EU (all except Irish) plus Icelandic, Croatian, Norwegian and Turkish. EAC-TM thus contains translations from English into the following 25 languages: Bulgarian, Czech, Danish, Dutch, Estonian, German, Greek, Finnish, French, Croatian, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish and Turkish.\n\nAll documents and sentences were originally written in English (source language is English) and then translated into the other languages. The texts were translated by staff of the National Agencies of the Lifelong Learning and Youth in Action programmes. They are typically professionals in the field of education/youth and EU programmes. They are thus not professional translators, but they are normally native speakers of the target language.", "downloads": 584, "configs": {"en2bg": {"config_name": "en2bg", "sample_row": "{\"translation.en\": \"\\\"APPLICANT\\\"\", \"translation.bg\": \"\\\"\\\\u041a\\\\u0410\\\\u041d\\\\u0414\\\\u0418\\\\u0414\\\\u0410\\\\u0422\\\"...\", \"sentence_type\": \"0\"}", "columns": ["translation_en", "translation_bg", "sentence_type"], "columns_mapping": {"translation.en": "translation_en", "translation.bg": "translation_bg", "sentence_type": "sentence_type"}, "dataset_description": "In October 2012, the European Union's (EU) Directorate General for Education and Culture ( DG EAC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-six languages. This resource bears the name EAC Translation Memory, short EAC-TM.\n\nEAC-TM covers up to 26 languages: 22 official languages of the EU (all except Irish) plus Icelandic, Croatian, Norwegian and Turkish. EAC-TM thus contains translations from English into the following 25 languages: Bulgarian, Czech, Danish, Dutch, Estonian, German, Greek, Finnish, French, Croatian, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish and Turkish.\n\nAll documents and sentences were originally written in English (source language is English) and then translated into the other languages. The texts were translated by staff of the National Agencies of the Lifelong Learning and Youth in Action programmes. They are typically professionals in the field of education/youth and EU programmes. They are thus not professional translators, but they are normally native speakers of the target language.\n", "dataset_name": "europa_eac_tm"}, "en2es": {"config_name": "en2es", "sample_row": "{\"translation.en\": \"\\\"Nr. teachers/trainers\\\"\", \"translation.es\": \"\\\"N\\\\u00famero de profesores/formadores\\\"\", \"sentence_type\": \"0\"}", "columns": ["translation_en", "translation_es", "sentence_type"], "columns_mapping": {"translation.en": "translation_en", "translation.es": "translation_es", "sentence_type": "sentence_type"}, "dataset_description": "In October 2012, the European Union's (EU) Directorate General for Education and Culture ( DG EAC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-six languages. This resource bears the name EAC Translation Memory, short EAC-TM.\n\nEAC-TM covers up to 26 languages: 22 official languages of the EU (all except Irish) plus Icelandic, Croatian, Norwegian and Turkish. EAC-TM thus contains translations from English into the following 25 languages: Bulgarian, Czech, Danish, Dutch, Estonian, German, Greek, Finnish, French, Croatian, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish and Turkish.\n\nAll documents and sentences were originally written in English (source language is English) and then translated into the other languages. The texts were translated by staff of the National Agencies of the Lifelong Learning and Youth in Action programmes. They are typically professionals in the field of education/youth and EU programmes. They are thus not professional translators, but they are normally native speakers of the target language.\n", "dataset_name": "europa_eac_tm"}, "en2fr": {"config_name": "en2fr", "sample_row": "{\"translation.en\": \"\\\"Nr. teachers/trainers\\\"\", \"translation.fr\": \"\\\"Nb enseignants/formateurs\\\"\", \"sentence_type\": \"0\"}", "columns": ["translation_en", "translation_fr", "sentence_type"], "columns_mapping": {"translation.en": "translation_en", "translation.fr": "translation_fr", "sentence_type": "sentence_type"}, "dataset_description": "In October 2012, the European Union's (EU) Directorate General for Education and Culture ( DG EAC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-six languages. This resource bears the name EAC Translation Memory, short EAC-TM.\n\nEAC-TM covers up to 26 languages: 22 official languages of the EU (all except Irish) plus Icelandic, Croatian, Norwegian and Turkish. EAC-TM thus contains translations from English into the following 25 languages: Bulgarian, Czech, Danish, Dutch, Estonian, German, Greek, Finnish, French, Croatian, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian, Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish, Swedish and Turkish.\n\nAll documents and sentences were originally written in English (source language is English) and then translated into the other languages. The texts were translated by staff of the National Agencies of the Lifelong Learning and Youth in Action programmes. They are typically professionals in the field of education/youth and EU programmes. They are thus not professional translators, but they are normally native speakers of the target language.\n", "dataset_name": "europa_eac_tm"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:hr", "language:hu", "language:is", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:no", "language:pl", "language:pt", "language:ro", "language:sk", "language:sl", "language:sv", "language:tr"], "is_gated": false}, "europa_ecdc_tm": {"dataset_name": "europa_ecdc_tm", "description": "In October 2012, the European Union (EU) agency 'European Centre for Disease Prevention and Control' (ECDC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-five languages. This resource bears the name EAC Translation Memory, short EAC-TM.\nECDC-TM covers 25 languages: the 23 official languages of the EU plus Norwegian (Norsk) and Icelandic. ECDC-TM was created by translating from English into the following 24 languages: Bulgarian, Czech, Danish, Dutch, English, Estonian, Gaelige (Irish), German, Greek, Finnish, French, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian (NOrsk), Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish and Swedish.\nAll documents and sentences were thus originally written in English. They were then translated into the other languages by professional translators from the Translation Centre CdT in Luxembourg.", "downloads": 571, "configs": {"en2bg": {"config_name": "en2bg", "sample_row": "{\"translation.en\": \"\\\"Vaccination against hepatitis C is not yet availa...\", \"translation.bg\": \"\\\"\\\\u0417\\\\u0430\\\\u0441\\\\u0435\\\\u0433\\\\u0430 \\\\u043d\\\\u044f...\"}", "columns": ["translation_en", "translation_bg"], "columns_mapping": {"translation.en": "translation_en", "translation.bg": "translation_bg"}, "dataset_description": "In October 2012, the European Union (EU) agency 'European Centre for Disease Prevention and Control' (ECDC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-five languages. This resource bears the name EAC Translation Memory, short EAC-TM.\nECDC-TM covers 25 languages: the 23 official languages of the EU plus Norwegian (Norsk) and Icelandic. ECDC-TM was created by translating from English into the following 24 languages: Bulgarian, Czech, Danish, Dutch, English, Estonian, Gaelige (Irish), German, Greek, Finnish, French, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian (NOrsk), Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish and Swedish.\nAll documents and sentences were thus originally written in English. They were then translated into the other languages by professional translators from the Translation Centre CdT in Luxembourg.", "dataset_name": "europa_ecdc_tm"}, "en2fr": {"config_name": "en2fr", "sample_row": "{\"translation.en\": \"\\\"Vaccination against hepatitis C is not yet availa...\", \"translation.fr\": \"\\\"Aucune vaccination contre l\\\\u2019h\\\\u00e9patite C ...\"}", "columns": ["translation_en", "translation_fr"], "columns_mapping": {"translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "In October 2012, the European Union (EU) agency 'European Centre for Disease Prevention and Control' (ECDC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-five languages. This resource bears the name EAC Translation Memory, short EAC-TM.\nECDC-TM covers 25 languages: the 23 official languages of the EU plus Norwegian (Norsk) and Icelandic. ECDC-TM was created by translating from English into the following 24 languages: Bulgarian, Czech, Danish, Dutch, English, Estonian, Gaelige (Irish), German, Greek, Finnish, French, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian (NOrsk), Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish and Swedish.\nAll documents and sentences were thus originally written in English. They were then translated into the other languages by professional translators from the Translation Centre CdT in Luxembourg.", "dataset_name": "europa_ecdc_tm"}, "en2sl": {"config_name": "en2sl", "sample_row": "{\"translation.en\": \"\\\"Vaccination against hepatitis C is not yet availa...\", \"translation.sl\": \"\\\"Cepiva proti hepatitisu C \\\\u0161e ni.\\\"\"}", "columns": ["translation_en", "translation_sl"], "columns_mapping": {"translation.en": "translation_en", "translation.sl": "translation_sl"}, "dataset_description": "In October 2012, the European Union (EU) agency 'European Centre for Disease Prevention and Control' (ECDC) released a translation memory (TM), i.e. a collection of sentences and their professionally produced translations, in twenty-five languages. This resource bears the name EAC Translation Memory, short EAC-TM.\nECDC-TM covers 25 languages: the 23 official languages of the EU plus Norwegian (Norsk) and Icelandic. ECDC-TM was created by translating from English into the following 24 languages: Bulgarian, Czech, Danish, Dutch, English, Estonian, Gaelige (Irish), German, Greek, Finnish, French, Hungarian, Icelandic, Italian, Latvian, Lithuanian, Maltese, Norwegian (NOrsk), Polish, Portuguese, Romanian, Slovak, Slovenian, Spanish and Swedish.\nAll documents and sentences were thus originally written in English. They were then translated into the other languages by professional translators from the Translation Centre CdT in Luxembourg.", "dataset_name": "europa_ecdc_tm"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:ga", "language:hu", "language:is", "language:it", "language:lt", "language:lv", "language:mt", "language:nl", "language:no", "language:pl", "language:pt", "language:ro", "language:sk", "language:sl", "language:sv"], "is_gated": false}, "europarl_bilingual": {"dataset_name": "europarl_bilingual", "description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.", "downloads": 990, "configs": {"bg-cs": {"config_name": "bg-cs", "sample_row": "{\"translation.bg\": \"\\\"\\\\u0421\\\\u044a\\\\u0441\\\\u0442\\\\u0430\\\\u0432 \\\\u043d\\\\u0430...\", \"translation.cs\": \"\\\"Slo\\\\u017een\\\\u00ed Parlamentu: viz z\\\\u00e1pis\\\"\"}", "columns": ["translation_bg", "translation_cs"], "columns_mapping": {"translation.bg": "translation_bg", "translation.cs": "translation_cs"}, "dataset_description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.\n", "dataset_name": "europarl_bilingual"}, "bg-da": {"config_name": "bg-da", "sample_row": "{\"translation.bg\": \"\\\"\\\\u0421\\\\u044a\\\\u0441\\\\u0442\\\\u0430\\\\u0432 \\\\u043d\\\\u0430...\", \"translation.da\": \"\\\"Parlamentets sammens\\\\u00e6tning: se protokollen\\\"...\"}", "columns": ["translation_bg", "translation_da"], "columns_mapping": {"translation.bg": "translation_bg", "translation.da": "translation_da"}, "dataset_description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.\n", "dataset_name": "europarl_bilingual"}, "bg-de": {"config_name": "bg-de", "sample_row": "{\"translation.bg\": \"\\\"\\\\u0421\\\\u044a\\\\u0441\\\\u0442\\\\u0430\\\\u0432 \\\\u043d\\\\u0430...\", \"translation.de\": \"\\\"Zusammensetzung des Parlaments: siehe Protokoll\\\"...\"}", "columns": ["translation_bg", "translation_de"], "columns_mapping": {"translation.bg": "translation_bg", "translation.de": "translation_de"}, "dataset_description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.\n", "dataset_name": "europarl_bilingual"}, "bg-el": {"config_name": "bg-el", "sample_row": "{\"translation.bg\": \"\\\"\\\\u0421\\\\u044a\\\\u0441\\\\u0442\\\\u0430\\\\u0432 \\\\u043d\\\\u0430...\", \"translation.el\": \"\\\"\\\\u03a3\\\\u03cd\\\\u03bd\\\\u03b8\\\\u03b5\\\\u03c3\\\\u03b7 \\\\u03c4...\"}", "columns": ["translation_bg", "translation_el"], "columns_mapping": {"translation.bg": "translation_bg", "translation.el": "translation_el"}, "dataset_description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.\n", "dataset_name": "europarl_bilingual"}, "bg-en": {"config_name": "bg-en", "sample_row": "{\"translation.bg\": \"\\\"\\\\u0421\\\\u044a\\\\u0441\\\\u0442\\\\u0430\\\\u0432 \\\\u043d\\\\u0430...\", \"translation.en\": \"\\\"Membership of Parliament: see Minutes\\\"\"}", "columns": ["translation_bg", "translation_en"], "columns_mapping": {"translation.bg": "translation_bg", "translation.en": "translation_en"}, "dataset_description": "A parallel corpus extracted from the European Parliament web site by Philipp Koehn (University of Edinburgh). The main intended use is to aid statistical machine translation research.\n", "dataset_name": "europarl_bilingual"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:translation", "source_datasets:original", "language:bg", "language:cs", "language:da", "language:de", "language:el", "language:en", "language:es", "language:et", "language:fi", "language:fr", "language:hu", "language:it", "language:lt", "language:lv", "language:nl", "language:pl", "language:pt", "language:ro", "language:sk", "language:sl", "language:sv"], "is_gated": false}, "event2Mind": {"dataset_name": "event2Mind", "description": "In Event2Mind, we explore the task of understanding stereotypical intents and reactions to events. Through crowdsourcing, we create a large corpus with 25,000 events and free-form descriptions of their intents and reactions, both of the event's subject and (potentially implied) other participants.", "downloads": 316, "configs": {"default": {"config_name": "default", "sample_row": "{\"Source\": \"\\\"it_events\\\"\", \"Event\": \"\\\"It is PersonY's favorite color\\\"\", \"Xintent\": \"\\\"[\\\\\\\"none\\\\\\\"]\\\"\", \"Xemotion\": \"\\\"[\\\\\\\"none\\\\\\\"]\\\"\", \"Otheremotion\": \"\\\"[\\\\\\\"happy\\\\\\\"]\\\"\", \"Xsent\": \"\\\"\\\"\", \"Osent\": \"\\\"4.0\\\"\"}", "columns": ["Source", "Event", "Xintent", "Xemotion", "Otheremotion", "Xsent", "Osent"], "columns_mapping": {"Source": "Source", "Event": "Event", "Xintent": "Xintent", "Xemotion": "Xemotion", "Otheremotion": "Otheremotion", "Xsent": "Xsent", "Osent": "Osent"}, "dataset_description": "In Event2Mind, we explore the task of understanding stereotypical intents and reactions to events. Through crowdsourcing, we create a large corpus with 25,000 events and free-form descriptions of their intents and reactions, both of the event's subject and (potentially implied) other participants.\n", "dataset_name": "event2Mind"}}, "tags": ["task_categories:text2text-generation", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "common-sense-inference"], "is_gated": false}, "factckbr": {"dataset_name": "factckbr", "description": "A dataset to study Fake News in Portuguese, presenting a supposedly false News along with their respective fact check and classification.\nThe data is collected from the ClaimReview, a structured data schema used by fact check agencies to share their results in search engines, enabling data collect in real time.\nThe FACTCK.BR dataset contains 1309 claims with its corresponding label.", "downloads": 292, "configs": {"default": {"config_name": "default", "sample_row": "{\"url\": \"\\\"https://aosfatos.org/noticias/governo-bolsonaro-n...\", \"author\": \"\\\"https:www.aosfatos.org\\\"\", \"date\": \"\\\"2019-07-22\\\"\", \"claim\": \"\\\"Espa\\\\u00e7o dedicado para os eleitores do Bolsona...\", \"review\": \"\\\"Publica\\\\u00e7\\\\u00f5es que circulam nas redes soci...\", \"title\": \"\\\"Governo Bolsonaro n\\\\u00e3o suspendeu distribui\\\\u0...\", \"rating\": \"1.0\", \"best_rating\": \"5.0\", \"label\": \"0\"}", "columns": ["url", "author", "date", "claim", "review", "title", "rating", "best_rating", "label"], "columns_mapping": {"url": "url", "author": "author", "date": "date", "claim": "claim", "review": "review", "title": "title", "rating": "rating", "best_rating": "best_rating", "label": "label"}, "dataset_description": "A dataset to study Fake News in Portuguese, presenting a supposedly false News along with their respective fact check and classification.\nThe data is collected from the ClaimReview, a structured data schema used by fact check agencies to share their results in search engines, enabling data collect in real time.\nThe FACTCK.BR dataset contains 1309 claims with its corresponding label.\n", "dataset_name": "factckbr"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt"], "is_gated": false}, "fake_news_english": {"dataset_name": "fake_news_english", "description": "Fake news has become a major societal issue and a technical challenge for social media companies to identify. This content is difficult to identify because the term \"fake news\" covers intentionally false, deceptive stories as well as factual errors, satire, and sometimes, stories that a person just does not like. Addressing the problem requires clear definitions and examples. In this work, we present a dataset of fake news and satire stories that are hand coded, verified, and, in the case of fake news, include rebutting stories. We also include a thematic content analysis of the articles, identifying major themes that include hyperbolic support or condemnation of a gure, conspiracy theories, racist themes, and discrediting of reliable sources. In addition to releasing this dataset for research use, we analyze it and show results based on language that are promising for classification purposes. Overall, our contribution of a dataset and initial analysis are designed to support future work by fake news researchers.", "downloads": 346, "configs": {"default": {"config_name": "default", "sample_row": "{\"article_number\": \"375\", \"url_of_article\": \"\\\"http://www.redflagnews.com/headlines-2016/cdc-pro...\", \"fake_or_satire\": \"1\", \"url_of_rebutting_article\": \"\\\"http://www.snopes.com/cdc-forced-vaccinations/\\\"...\"}", "columns": ["article_number", "url_of_article", "fake_or_satire", "url_of_rebutting_article"], "columns_mapping": {"article_number": "article_number", "url_of_article": "url_of_article", "fake_or_satire": "fake_or_satire", "url_of_rebutting_article": "url_of_rebutting_article"}, "dataset_description": "\nFake news has become a major societal issue and a technical challenge for social media companies to identify. This content is difficult to identify because the term \"fake news\" covers intentionally false, deceptive stories as well as factual errors, satire, and sometimes, stories that a person just does not like. Addressing the problem requires clear definitions and examples. In this work, we present a dataset of fake news and satire stories that are hand coded, verified, and, in the case of fake news, include rebutting stories. We also include a thematic content analysis of the articles, identifying major themes that include hyperbolic support or condemnation of a gure, conspiracy theories, racist themes, and discrediting of reliable sources. In addition to releasing this dataset for research use, we analyze it and show results based on language that are promising for classification purposes. Overall, our contribution of a dataset and initial analysis are designed to support future work by fake news researchers.\n", "dataset_name": "fake_news_english"}}, "tags": ["task_categories:text-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "fake_news_filipino": {"dataset_name": "fake_news_filipino", "description": " Low-Resource Fake News Detection Corpora in Filipino. The first of its kind. Contains 3,206 expertly-labeled news samples, half of which are real and half of which are fake.", "downloads": 396, "configs": {"default": {"config_name": "default", "sample_row": "{\"label\": \"0\", \"article\": \"\\\"Ayon sa TheWrap.com, naghain ng kaso si Krupa, 35...\"}", "columns": ["label", "article"], "columns_mapping": {"label": "label", "article": "article"}, "dataset_description": " Low-Resource Fake News Detection Corpora in Filipino. The first of its kind. Contains 3,206 expertly-labeled news samples, half of which are real and half of which are fake.\n", "dataset_name": "fake_news_filipino"}}, "tags": ["task_categories:text-classification", "task_ids:fact-checking", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:tl"], "is_gated": false}, "financial_phrasebank": {"dataset_name": "financial_phrasebank", "description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.", "downloads": 9147, "configs": {"sentences_allagree": {"config_name": "sentences_allagree", "sample_row": "{\"sentence\": \"\\\"According to Gran , the company has no plans to m...\", \"label\": \"1\"}", "columns": ["sentence", "label"], "columns_mapping": {"sentence": "sentence", "label": "label"}, "dataset_description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.\n", "dataset_name": "financial_phrasebank"}, "sentences_75agree": {"config_name": "sentences_75agree", "sample_row": "{\"sentence\": \"\\\"According to Gran , the company has no plans to m...\", \"label\": \"1\"}", "columns": ["sentence", "label"], "columns_mapping": {"sentence": "sentence", "label": "label"}, "dataset_description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.\n", "dataset_name": "financial_phrasebank"}, "sentences_66agree": {"config_name": "sentences_66agree", "sample_row": "{\"sentence\": \"\\\"According to Gran , the company has no plans to m...\", \"label\": \"1\"}", "columns": ["sentence", "label"], "columns_mapping": {"sentence": "sentence", "label": "label"}, "dataset_description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.\n", "dataset_name": "financial_phrasebank"}, "sentences_50agree": {"config_name": "sentences_50agree", "sample_row": "{\"sentence\": \"\\\"According to Gran , the company has no plans to m...\", \"label\": \"1\"}", "columns": ["sentence", "label"], "columns_mapping": {"sentence": "sentence", "label": "label"}, "dataset_description": "The key arguments for the low utilization of statistical techniques in\nfinancial sentiment analysis have been the difficulty of implementation for\npractical applications and the lack of high quality training data for building\nsuch models. Especially in the case of finance and economic texts, annotated\ncollections are a scarce resource and many are reserved for proprietary use\nonly. To resolve the missing training data problem, we present a collection of\n\u223c 5000 sentences to establish human-annotated standards for benchmarking\nalternative modeling techniques.\n\nThe objective of the phrase level annotation task was to classify each example\nsentence into a positive, negative or neutral category by considering only the\ninformation explicitly available in the given sentence. Since the study is\nfocused only on financial and economic domains, the annotators were asked to\nconsider the sentences from the view point of an investor only; i.e. whether\nthe news may have positive, negative or neutral influence on the stock price.\nAs a result, sentences which have a sentiment that is not relevant from an\neconomic or financial perspective are considered neutral.\n\nThis release of the financial phrase bank covers a collection of 4840\nsentences. The selected collection of phrases was annotated by 16 people with\nadequate background knowledge on financial markets. Three of the annotators\nwere researchers and the remaining 13 annotators were master\u2019s students at\nAalto University School of Business with majors primarily in finance,\naccounting, and economics.\n\nGiven the large number of overlapping annotations (5 to 8 annotations per\nsentence), there are several ways to define a majority vote based gold\nstandard. To provide an objective comparison, we have formed 4 alternative\nreference datasets based on the strength of majority agreement: all annotators\nagree, >=75% of annotators agree, >=66% of annotators agree and >=50% of\nannotators agree.\n", "dataset_name": "financial_phrasebank"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "finance"], "is_gated": false}, "finer": {"dataset_name": "finer", "description": "The directory data contains a corpus of Finnish technology related news articles with a manually prepared\nnamed entity annotation (digitoday.2014.csv). The text material was extracted from the archives of Digitoday,\na Finnish online technology news source (www.digitoday.fi). The corpus consists of 953 articles\n(193,742 word tokens) with six named entity classes (organization, location, person, product, event, and date).\nThe corpus is available for research purposes and can be readily used for development of NER systems for Finnish.", "downloads": 289, "configs": {"finer": {"config_name": "finer", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Imperiumi\\\", \\\"laajenee\\\", \\\":\\\", \\\"Maailman\\\", \\\"suurin...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0]\", \"nested_ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "nested_ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "nested_ner_tags": "nested_ner_tags"}, "dataset_description": "The directory data contains a corpus of Finnish technology related news articles with a manually prepared\nnamed entity annotation (digitoday.2014.csv). The text material was extracted from the archives of Digitoday,\na Finnish online technology news source (www.digitoday.fi). The corpus consists of 953 articles\n(193,742 word tokens) with six named entity classes (organization, location, person, product, event, and date).\nThe corpus is available for research purposes and can be readily used for development of NER systems for Finnish.\n", "dataset_name": "finer"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:fi"], "is_gated": false}, "freebase_qa": {"dataset_name": "freebase_qa", "description": "FreebaseQA is for open-domain factoid question answering (QA) tasks over structured knowledge bases, like Freebase The data set is generated by matching trivia-type question-answer pairs with subject-predicateobject triples in Freebase.", "downloads": 487, "configs": {"default": {"config_name": "default", "sample_row": "{\"Question-ID\": \"\\\"FreebaseQA-train-0\\\"\", \"RawQuestion\": \"\\\"What was Pierce Brosnan's first outing as 007?\\\"...\", \"ProcessedQuestion\": \"\\\"what was pierce brosnan's first outing as 007\\\"\", \"Parses.Parse-Id\": \"[\\\"FreebaseQA-train-0.P0\\\", \\\"FreebaseQA-train-0.P1\\\"]...\", \"Parses.PotentialTopicEntityMention\": \"[\\\"007\\\", \\\"pierce brosnan\\\"]\", \"Parses.TopicEntityName\": \"[\\\"james bond\\\", \\\"pierce brosnan\\\"]\", \"Parses.TopicEntityMid\": \"[\\\"m.0clpml\\\", \\\"m.018p4y\\\"]\", \"Parses.InferentialChain\": \"[\\\"film.film_character.portrayed_in_films..film.per...\", \"Parses.Answers\": \"[{\\\"AnswersMid\\\": [\\\"m.01npcx\\\"], \\\"AnswersName\\\": [[\\\"go...\"}", "columns": ["Question-ID", "RawQuestion", "ProcessedQuestion", "Parses_Parse-Id", "Parses_PotentialTopicEntityMention", "Parses_TopicEntityName", "Parses_TopicEntityMid", "Parses_InferentialChain", "Parses_Answers"], "columns_mapping": {"Question-ID": "Question-ID", "RawQuestion": "RawQuestion", "ProcessedQuestion": "ProcessedQuestion", "Parses.Parse-Id": "Parses_Parse-Id", "Parses.PotentialTopicEntityMention": "Parses_PotentialTopicEntityMention", "Parses.TopicEntityName": "Parses_TopicEntityName", "Parses.TopicEntityMid": "Parses_TopicEntityMid", "Parses.InferentialChain": "Parses_InferentialChain", "Parses.Answers": "Parses_Answers"}, "dataset_description": "FreebaseQA is for open-domain factoid question answering (QA) tasks over structured knowledge bases, like Freebase The data set is generated by matching trivia-type question-answer pairs with subject-predicateobject triples in Freebase.\n", "dataset_name": "freebase_qa"}}, "tags": ["task_categories:question-answering", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|trivia_qa", "language:en"], "is_gated": false}, "gap": {"dataset_name": "gap", "description": "GAP is a gender-balanced dataset containing 8,908 coreference-labeled pairs of\n(ambiguous pronoun, antecedent name), sampled from Wikipedia and released by\nGoogle AI Language for the evaluation of coreference resolution in practical\napplications.", "downloads": 328, "configs": {"default": {"config_name": "default", "sample_row": "{\"ID\": \"\\\"development-1\\\"\", \"Text\": \"\\\"Zoe Telford -- played the police officer girlfrie...\", \"Pronoun\": \"\\\"her\\\"\", \"Pronoun-offset\": \"274\", \"A\": \"\\\"Cheryl Cassidy\\\"\", \"A-offset\": \"191\", \"A-coref\": \"true\", \"B\": \"\\\"Pauline\\\"\", \"B-offset\": \"207\", \"B-coref\": \"false\", \"URL\": \"\\\"http://en.wikipedia.org/wiki/List_of_Teachers_(UK...\"}", "columns": ["ID", "Text", "Pronoun", "Pronoun-offset", "A", "A-offset", "A-coref", "B", "B-offset", "B-coref", "URL"], "columns_mapping": {"ID": "ID", "Text": "Text", "Pronoun": "Pronoun", "Pronoun-offset": "Pronoun-offset", "A": "A", "A-offset": "A-offset", "A-coref": "A-coref", "B": "B", "B-offset": "B-offset", "B-coref": "B-coref", "URL": "URL"}, "dataset_description": "\nGAP is a gender-balanced dataset containing 8,908 coreference-labeled pairs of\n(ambiguous pronoun, antecedent name), sampled from Wikipedia and released by\nGoogle AI Language for the evaluation of coreference resolution in practical\napplications.\n", "dataset_name": "gap"}}, "tags": ["task_categories:token-classification", "task_ids:coreference-resolution", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en"], "is_gated": false}, "generics_kb": {"dataset_name": "generics_kb", "description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. We recommend you start with GenericsKB-Best.", "downloads": 945, "configs": {"generics_kb_best": {"config_name": "generics_kb_best", "sample_row": "{\"source\": \"\\\"Waterloo\\\"\", \"term\": \"\\\"aa battery\\\"\", \"quantifier_frequency\": \"\\\"\\\"\", \"quantifier_number\": \"\\\"\\\"\", \"generic_sentence\": \"\\\"AA batteries maintain the settings if the power e...\", \"score\": \"0.35092294216156006\"}", "columns": ["source", "term", "quantifier_frequency", "quantifier_number", "generic_sentence", "score"], "columns_mapping": {"source": "source", "term": "term", "quantifier_frequency": "quantifier_frequency", "quantifier_number": "quantifier_number", "generic_sentence": "generic_sentence", "score": "score"}, "dataset_description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. We recommend you start with GenericsKB-Best.\n", "dataset_name": "generics_kb"}, "generics_kb": {"config_name": "generics_kb", "sample_row": "{\"source\": \"\\\"Waterloo\\\"\", \"term\": \"\\\"a.active replication\\\"\", \"quantifier_frequency\": \"\\\"\\\"\", \"quantifier_number\": \"\\\"\\\"\", \"generic_sentence\": \"\\\"A.Active replication requires all members to exec...\", \"score\": \"0.024261321872472763\"}", "columns": ["source", "term", "quantifier_frequency", "quantifier_number", "generic_sentence", "score"], "columns_mapping": {"source": "source", "term": "term", "quantifier_frequency": "quantifier_frequency", "quantifier_number": "quantifier_number", "generic_sentence": "generic_sentence", "score": "score"}, "dataset_description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. We recommend you start with GenericsKB-Best.\n", "dataset_name": "generics_kb"}, "generics_kb_simplewiki": {"config_name": "generics_kb_simplewiki", "sample_row": "{\"source_name\": \"\\\"SimpleWikipedia\\\"\", \"sentence\": \"\\\"Sepsis happens when the bacterium enters the bloo...\", \"sentences_before\": \"[]\", \"sentences_after\": \"[]\", \"concept_name\": \"\\\"sepsis\\\"\", \"quantifiers\": \"[]\", \"id\": \"\\\"SimpleWikipedia--tmp-sw-rs1-with-bug-fixes-initia...\", \"bert_score\": \"0.8396177887916565\", \"headings\": \"[\\\"Bubonic plague\\\", \\\"Different kinds of the same di...\", \"categories\": \"[\\\"Diseases caused by bacteria\\\", \\\"Pulmonology\\\"]\"}", "columns": ["source_name", "sentence", "sentences_before", "sentences_after", "concept_name", "quantifiers", "id", "bert_score", "headings", "categories"], "columns_mapping": {"source_name": "source_name", "sentence": "sentence", "sentences_before": "sentences_before", "sentences_after": "sentences_after", "concept_name": "concept_name", "quantifiers": "quantifiers", "id": "id", "bert_score": "bert_score", "headings": "headings", "categories": "categories"}, "dataset_description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. We recommend you start with GenericsKB-Best.\n", "dataset_name": "generics_kb"}, "generics_kb_waterloo": {"config_name": "generics_kb_waterloo", "sample_row": "{\"source_name\": \"\\\"Waterloo\\\"\", \"sentence\": \"\\\"Businesses can also survive by marketing to non-l...\", \"sentences_before\": \"[\\\"The low population also means that there are not...\", \"sentences_after\": \"[\\\"Our town covers an area about 8 blocks long, by ...\", \"concept_name\": \"\\\"business\\\"\", \"quantifiers\": \"[]\", \"id\": \"\\\"Waterloo-sbhaktha-waterloo-clean-node10-of-38-par...\", \"bert_score\": \"0.1443023681640625\"}", "columns": ["source_name", "sentence", "sentences_before", "sentences_after", "concept_name", "quantifiers", "id", "bert_score"], "columns_mapping": {"source_name": "source_name", "sentence": "sentence", "sentences_before": "sentences_before", "sentences_after": "sentences_after", "concept_name": "concept_name", "quantifiers": "quantifiers", "id": "id", "bert_score": "bert_score"}, "dataset_description": "The GenericsKB contains 3.4M+ generic sentences about the world, i.e., sentences expressing general truths such as \"Dogs bark,\" and \"Trees remove carbon dioxide from the atmosphere.\" Generics are potentially useful as a knowledge source for AI systems requiring general world knowledge. The GenericsKB is the first large-scale resource containing naturally occurring generic sentences (as opposed to extracted or crowdsourced triples), and is rich in high-quality, general, semantically complete statements. Generics were primarily extracted from three large text sources, namely the Waterloo Corpus, selected parts of Simple Wikipedia, and the ARC Corpus. A filtered, high-quality subset is also available in GenericsKB-Best, containing 1,020,868 sentences. We recommend you start with GenericsKB-Best.\n", "dataset_name": "generics_kb"}}, "tags": ["task_categories:other", "annotations_creators:machine-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "knowledge-base"], "is_gated": false}, "germaner": {"dataset_name": "germaner", "description": "GermaNER is a freely available statistical German Named Entity Tagger based on conditional random fields(CRF). The tagger is trained and evaluated on the NoSta-D Named Entity dataset, which was used in the GermEval 2014 for named entity recognition. The tagger comes close to the performance of the best (proprietary) system in the competition with 77% F-measure (this is the latest result; the one reported in the paper is 76%) test set performance on the four standard NER classes (PERson, LOCation, ORGanisation and OTHer).\n\nWe describe a range of features and their influence on German NER classification and provide a comparative evaluation and some analysis of the results. The software components, the training data and all data used for feature generation are distributed under permissive licenses, thus this tagger can be used in academic and commercial settings without restrictions or fees. The tagger is available as a command-line tool and as an Apache UIMA component.", "downloads": 316, "configs": {"default": {"config_name": "default", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Schartau\\\", \\\"sagte\\\", \\\"dem\\\", \\\"\\\\\\\"\\\", \\\"Tagesspiegel\\\",...\", \"ner_tags\": \"[3, 8, 8, 8, 1, 8, 8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "GermaNER is a freely available statistical German Named Entity Tagger based on conditional random fields(CRF). The tagger is trained and evaluated on the NoSta-D Named Entity dataset, which was used in the GermEval 2014 for named entity recognition. The tagger comes close to the performance of the best (proprietary) system in the competition with 77% F-measure (this is the latest result; the one reported in the paper is 76%) test set performance on the four standard NER classes (PERson, LOCation, ORGanisation and OTHer).\n\nWe describe a range of features and their influence on German NER classification and provide a comparative evaluation and some analysis of the results. The software components, the training data and all data used for feature generation are distributed under permissive licenses, thus this tagger can be used in academic and commercial settings without restrictions or fees. The tagger is available as a command-line tool and as an Apache UIMA component.\n", "dataset_name": "germaner"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:de"], "is_gated": false}, "giga_fren": {"dataset_name": "giga_fren", "description": "Giga-word corpus for French-English from WMT2010 collected by Chris Callison-Burch\n2 languages, total number of files: 452\ntotal number of tokens: 1.43G\ntotal number of sentence fragments: 47.55M", "downloads": 291, "configs": {"en-fr": {"config_name": "en-fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"translation.en\": \"\\\"Changing Lives _BAR_ Changing Society _BAR_ How I...\", \"translation.fr\": \"\\\"Il a transform\\\\u00e9 notre vie _BAR_ Il a transfo...\"}", "columns": ["id", "translation_en", "translation_fr"], "columns_mapping": {"id": "id", "translation.en": "translation_en", "translation.fr": "translation_fr"}, "dataset_description": "Giga-word corpus for French-English from WMT2010 collected by Chris Callison-Burch\n2 languages, total number of files: 452\ntotal number of tokens: 1.43G\ntotal number of sentence fragments: 47.55M\n", "dataset_name": "giga_fren"}}, "tags": ["task_categories:translation", "annotations_creators:found", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:fr"], "is_gated": false}, "gnad10": {"dataset_name": "gnad10", "description": "This dataset is intended to advance topic classification for German texts. A classifier that is efffective in\nEnglish may not be effective in German dataset because it has a higher inflection and longer compound words.\nThe 10kGNAD dataset contains 10273 German news articles from an Austrian online newspaper categorized into\n9 categories. Article titles and text are concatenated together and authors are removed to avoid a keyword-like\nclassification on authors that write frequently about one category. This dataset can be used as a benchmark\nfor German topic classification.", "downloads": 532, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"21-J\\\\u00e4hriger f\\\\u00e4llt wohl bis Saisonende a...\", \"label\": \"4\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "This dataset is intended to advance topic classification for German texts. A classifier that is efffective in\nEnglish may not be effective in German dataset because it has a higher inflection and longer compound words.\nThe 10kGNAD dataset contains 10273 German news articles from an Austrian online newspaper categorized into\n9 categories. Article titles and text are concatenated together and authors are removed to avoid a keyword-like\nclassification on authors that write frequently about one category. This dataset can be used as a benchmark\nfor German topic classification.\n", "dataset_name": "gnad10"}}, "tags": ["task_categories:text-classification", "task_ids:topic-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|other-from-One-Million-Posts-Corpus", "language:de"], "is_gated": false}, "go_emotions": {"dataset_name": "go_emotions", "description": "The GoEmotions dataset contains 58k carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\nThe emotion categories are admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire,\ndisappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness,\noptimism, pride, realization, relief, remorse, sadness, surprise.", "downloads": 7166, "configs": {"raw": {"config_name": "raw", "sample_row": "{\"text\": \"\\\"That game hurt.\\\"\", \"id\": \"\\\"eew5j0j\\\"\", \"author\": \"\\\"Brdd9\\\"\", \"subreddit\": \"\\\"nrl\\\"\", \"link_id\": \"\\\"t3_ajis4z\\\"\", \"parent_id\": \"\\\"t1_eew18eq\\\"\", \"created_utc\": \"1548381039.0\", \"rater_id\": \"1\", \"example_very_unclear\": \"false\", \"admiration\": \"0\", \"amusement\": \"0\", \"anger\": \"0\", \"annoyance\": \"0\", \"approval\": \"0\", \"caring\": \"0\", \"confusion\": \"0\", \"curiosity\": \"0\", \"desire\": \"0\", \"disappointment\": \"0\", \"disapproval\": \"0\", \"disgust\": \"0\", \"embarrassment\": \"0\", \"excitement\": \"0\", \"fear\": \"0\", \"gratitude\": \"0\", \"grief\": \"0\", \"joy\": \"0\", \"love\": \"0\", \"nervousness\": \"0\", \"optimism\": \"0\", \"pride\": \"0\", \"realization\": \"0\", \"relief\": \"0\", \"remorse\": \"0\", \"sadness\": \"1\", \"surprise\": \"0\", \"neutral\": \"0\"}", "columns": ["text", "id", "author", "subreddit", "link_id", "parent_id", "created_utc", "rater_id", "example_very_unclear", "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion", "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism", "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"], "columns_mapping": {"text": "text", "id": "id", "author": "author", "subreddit": "subreddit", "link_id": "link_id", "parent_id": "parent_id", "created_utc": "created_utc", "rater_id": "rater_id", "example_very_unclear": "example_very_unclear", "admiration": "admiration", "amusement": "amusement", "anger": "anger", "annoyance": "annoyance", "approval": "approval", "caring": "caring", "confusion": "confusion", "curiosity": "curiosity", "desire": "desire", "disappointment": "disappointment", "disapproval": "disapproval", "disgust": "disgust", "embarrassment": "embarrassment", "excitement": "excitement", "fear": "fear", "gratitude": "gratitude", "grief": "grief", "joy": "joy", "love": "love", "nervousness": "nervousness", "optimism": "optimism", "pride": "pride", "realization": "realization", "relief": "relief", "remorse": "remorse", "sadness": "sadness", "surprise": "surprise", "neutral": "neutral"}, "dataset_description": "The GoEmotions dataset contains 58k carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\nThe emotion categories are admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire,\ndisappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness,\noptimism, pride, realization, relief, remorse, sadness, surprise.\n", "dataset_name": "go_emotions"}, "simplified": {"config_name": "simplified", "sample_row": "{\"text\": \"\\\"My favourite food is anything I didn't have to co...\", \"labels\": \"[27]\", \"id\": \"\\\"eebbqej\\\"\"}", "columns": ["text", "labels", "id"], "columns_mapping": {"text": "text", "labels": "labels", "id": "id"}, "dataset_description": "The GoEmotions dataset contains 58k carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\nThe emotion categories are admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire,\ndisappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness,\noptimism, pride, realization, relief, remorse, sadness, surprise.\n", "dataset_name": "go_emotions"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "task_ids:multi-label-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:en", "emotion"], "is_gated": false}, "google_wellformed_query": {"dataset_name": "google_wellformed_query", "description": "Google's query wellformedness dataset was created by crowdsourcing well-formedness annotations for 25,100 queries from the Paralex corpus. Every query was annotated by five raters each with 1/0 rating of whether or not the query is well-formed.", "downloads": 486, "configs": {"default": {"config_name": "default", "sample_row": "{\"rating\": \"0.2\", \"content\": \"\\\"The European Union includes how many ?\\\"\"}", "columns": ["rating", "content"], "columns_mapping": {"rating": "rating", "content": "content"}, "dataset_description": "Google's query wellformedness dataset was created by crowdsourcing well-formedness annotations for 25,100 queries from the Paralex corpus. Every query was annotated by five raters each with 1/0 rating of whether or not the query is well-formed.\n", "dataset_name": "google_wellformed_query"}}, "tags": ["task_categories:text-classification", "task_ids:text-scoring", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended", "language:en"], "is_gated": false}, "grail_qa": {"dataset_name": "grail_qa", "description": "Strongly Generalizable Question Answering (GrailQA) is a new large-scale, high-quality dataset for question answering on knowledge bases (KBQA) on Freebase with 64,331 questions annotated with both answers and corresponding logical forms in different syntax (i.e., SPARQL, S-expression, etc.). It can be used to test three levels of generalization in KBQA: i.i.d., compositional, and zero-shot.", "downloads": 378, "configs": {"default": {"config_name": "default", "sample_row": "{\"qid\": \"\\\"2101535001000\\\"\", \"question\": \"\\\"oxybutynin chloride 5 extended release film coate...\", \"answer.answer_type\": \"[\\\"Entity\\\", \\\"Entity\\\"]\", \"answer.answer_argument\": \"[\\\"m.0z3xfvs\\\", \\\"m.0z3xm0m\\\"]\", \"answer.entity_name\": \"[\\\"Oxybutynin Oral\\\", \\\"Oxybutynin Chloride Oral\\\"]\", \"function\": \"\\\"none\\\"\", \"num_node\": \"2\", \"num_edge\": \"1\", \"graph_query.nodes.nid\": \"[0, 1]\", \"graph_query.nodes.node_type\": \"[\\\"class\\\", \\\"entity\\\"]\", \"graph_query.nodes.id\": \"[\\\"medicine.routed_drug\\\", \\\"m.0hqs1x_\\\"]\", \"graph_query.nodes.class\": \"[\\\"medicine.routed_drug\\\", \\\"medicine.drug_formulatio...\", \"graph_query.nodes.friendly_name\": \"[\\\"Routed drug\\\", \\\"Oxybutynin chloride 5 extended re...\", \"graph_query.nodes.question_node\": \"[1, 0]\", \"graph_query.nodes.function\": \"[\\\"none\\\", \\\"none\\\"]\", \"graph_query.edges.start\": \"[0]\", \"graph_query.edges.end\": \"[1]\", \"graph_query.edges.relation\": \"[\\\"medicine.routed_drug.marketed_formulations\\\"]\", \"graph_query.edges.friendly_name\": \"[\\\"Marketed formulations\\\"]\", \"sparql_query\": \"\\\"PREFIX rdf: 894be9b4...\", \"unique_id\": \"\\\"1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be9b4...\", \"excerpt_index\": \"11\"}", "columns": ["source", "citeStart", "sectionName", "string", "citeEnd", "label", "label_confidence", "label2", "label2_confidence", "citingPaperId", "citedPaperId", "isKeyCitation", "id", "unique_id", "excerpt_index"], "columns_mapping": {"source": "source", "citeStart": "citeStart", "sectionName": "sectionName", "string": "string", "citeEnd": "citeEnd", "label": "label", "label_confidence": "label_confidence", "label2": "label2", "label2_confidence": "label2_confidence", "citingPaperId": "citingPaperId", "citedPaperId": "citedPaperId", "isKeyCitation": "isKeyCitation", "id": "id", "unique_id": "unique_id", "excerpt_index": "excerpt_index"}, "dataset_description": "SciCite is a dataset of 11K manually annotated citation intents based on\ncitation context in the computer science and biomedical domains.\n", "dataset_name": "bigbio/scicite"}, "scicite_bigbio_text": {"config_name": "scicite_bigbio_text", "sample_row": "{\"id\": \"\\\"1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be9b4...\", \"document_id\": \"\\\"1872080baa7d30ec8fb87be9a65358cd3a7fb649\\\"\", \"text\": \"\\\"However, how frataxin interacts with the Fe-S clu...\", \"labels\": \"[\\\"background\\\"]\"}", "columns": ["id", "document_id", "text", "labels"], "columns_mapping": {"id": "id", "document_id": "document_id", "text": "text", "labels": "labels"}, "dataset_description": "SciCite is a dataset of 11K manually annotated citation intents based on\ncitation context in the computer science and biomedical domains.\n", "dataset_name": "bigbio/scicite"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/tmvar_v1": {"dataset_name": "bigbio/tmvar_v1", "description": "This dataset contains 500 PubMed articles manually annotated with mutation mentions of various kinds. It can be used for NER tasks only.\nThe dataset is split into train(334) and test(166) splits", "downloads": 50, "configs": {"tmvar_v1_source": {"config_name": "tmvar_v1_source", "sample_row": "{\"pmid\": \"\\\"22016685\\\"\", \"passages\": \"[{\\\"type\\\": \\\"title\\\", \\\"text\\\": \\\"A novel missense mutat...\", \"entities\": \"[{\\\"offsets\\\": [26, 35], \\\"text\\\": \\\"Asp506Gly\\\", \\\"seman...\"}", "columns": ["pmid", "passages", "entities"], "columns_mapping": {"pmid": "pmid", "passages": "passages", "entities": "entities"}, "dataset_description": "This dataset contains 500 PubMed articles manually annotated with mutation mentions of various kinds. It can be used for NER tasks only.\nThe dataset is split into train(334) and test(166) splits", "dataset_name": "bigbio/tmvar_v1"}, "tmvar_v1_bigbio_kb": {"config_name": "tmvar_v1_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"22016685\\\"\", \"passages\": \"[{\\\"id\\\": \\\"5\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"A novel mi...\", \"entities\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"ProteinMutation\\\", \\\"text\\\": [\\\"...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "This dataset contains 500 PubMed articles manually annotated with mutation mentions of various kinds. It can be used for NER tasks only.\nThe dataset is split into train(334) and test(166) splits", "dataset_name": "bigbio/tmvar_v1"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/tmvar_v2": {"dataset_name": "bigbio/tmvar_v2", "description": "This dataset contains 158 PubMed articles manually annotated with mutation mentions of various kinds and dbsnp normalizations for each of them.\nIt can be used for NER tasks and NED tasks, This dataset has a single split", "downloads": 63, "configs": {"tmvar_v2_source": {"config_name": "tmvar_v2_source", "sample_row": "{\"pmid\": \"\\\"22051099\\\"\", \"passages\": \"[{\\\"type\\\": \\\"title\\\", \\\"text\\\": \\\"Variation in the CXCR1...\", \"entities\": \"[{\\\"offsets\\\": [327, 336], \\\"text\\\": \\\"rs2234671\\\", \\\"sem...\"}", "columns": ["pmid", "passages", "entities"], "columns_mapping": {"pmid": "pmid", "passages": "passages", "entities": "entities"}, "dataset_description": "This dataset contains 158 PubMed articles manually annotated with mutation mentions of various kinds and dbsnp normalizations for each of them.\nIt can be used for NER tasks and NED tasks, This dataset has a single split", "dataset_name": "bigbio/tmvar_v2"}, "tmvar_v2_bigbio_kb": {"config_name": "tmvar_v2_bigbio_kb", "sample_row": "{\"id\": \"\\\"0\\\"\", \"document_id\": \"\\\"22051099\\\"\", \"passages\": \"[{\\\"id\\\": \\\"6\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\": [\\\"Variation ...\", \"entities\": \"[{\\\"id\\\": \\\"1\\\", \\\"type\\\": \\\"SNP\\\", \\\"text\\\": [\\\"rs2234671\\\"],...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[]\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "This dataset contains 158 PubMed articles manually annotated with mutation mentions of various kinds and dbsnp normalizations for each of them.\nIt can be used for NER tasks and NED tasks, This dataset has a single split", "dataset_name": "bigbio/tmvar_v2"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "PlanTL-GOB-ES/sts-es": {"dataset_name": "PlanTL-GOB-ES/sts-es", "description": "For Semantic Text Similarity, we collected the Spanish test sets from SemEval-2014 (Agirre et al., 2014) and SemEval-2015 (Agirre et al., 2015). Since no training data was provided for the Spanish subtask, we randomly sampled both datasets into 1,321 sentences for the train set, 78 sentences for the development set, and 156 sentences for the test set. To make the task harder for the models, we purposely made the development set smaller than the test set.", "downloads": 63, "configs": {"STS": {"config_name": "STS", "sample_row": "{\"id\": \"\\\"0\\\"\", \"sentence1\": \"\\\"Seg\\\\u00fan el sondeo, 87% de los cat\\\\u00f3licos c...\", \"sentence2\": \"\\\"El 87% de los cat\\\\u00f3licos del mundo aprobaron ...\", \"label\": \"3.75\"}", "columns": ["id", "sentence1", "sentence2", "label"], "columns_mapping": {"id": "id", "sentence1": "sentence1", "sentence2": "sentence2", "label": "label"}, "dataset_description": "\nFor Semantic Text Similarity, we collected the Spanish test sets from SemEval-2014 (Agirre et al., 2014) and SemEval-2015 (Agirre et al., 2015). Since no training data was provided for the Spanish subtask, we randomly sampled both datasets into 1,321 sentences for the train set, 78 sentences for the development set, and 156 sentences for the test set. To make the task harder for the models, we purposely made the development set smaller than the test set.\n", "dataset_name": "PlanTL-GOB-ES/sts-es"}}, "tags": ["task_categories:text-classification", "task_ids:semantic-similarity-scoring", "task_ids:text-scoring", "annotations_creators:expert-generated", "multilinguality:monolingual", "language:es"], "is_gated": false}, "PlanTL-GOB-ES/WikiCAT_esv2": {"dataset_name": "PlanTL-GOB-ES/WikiCAT_esv2", "description": "WikiCAT: Text Classification Spanish dataset from the Viquipedia", "downloads": 47, "configs": {"wikiCAT_es": {"config_name": "wikiCAT_es", "sample_row": "{\"text\": \"\\\"En estad\\\\u00edstica, un modelo probit es un tipo ...\", \"label\": \"5\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "\n WikiCAT: Text Classification Spanish dataset from the Viquipedia\n\n ", "dataset_name": "PlanTL-GOB-ES/WikiCAT_esv2"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:automatically-generated", "multilinguality:monolingual", "language:es"], "is_gated": false}, "jeanlee/kmhas_korean_hate_speech": {"dataset_name": "jeanlee/kmhas_korean_hate_speech", "description": "The K-MHaS (Korean Multi-label Hate Speech) dataset contains 109k utterances from Korean online news comments labeled with 8 fine-grained hate speech classes or Not Hate Speech class.\nThe fine-grained hate speech classes are politics, origin, physical, age, gender, religion, race, and profanity and these categories are selected in order to reflect the social and historical context.", "downloads": 555, "configs": {"default": {"config_name": "default", "sample_row": "{\"text\": \"\\\"\\\\\\\"\\\\uc790\\\\ud55c\\\\ub2f9\\\\ud2c0\\\\ub531\\\\ub4e4.. \\\\uc545\\\\u...\", \"label\": \"[2, 4]\"}", "columns": ["text", "label"], "columns_mapping": {"text": "text", "label": "label"}, "dataset_description": "The K-MHaS (Korean Multi-label Hate Speech) dataset contains 109k utterances from Korean online news comments labeled with 8 fine-grained hate speech classes or Not Hate Speech class.\nThe fine-grained hate speech classes are politics, origin, physical, age, gender, religion, race, and profanity and these categories are selected in order to reflect the social and historical context.\n", "dataset_name": "jeanlee/kmhas_korean_hate_speech"}}, "tags": ["task_categories:text-classification", "task_ids:multi-label-classification", "task_ids:hate-speech-detection", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ko", "K-MHaS", "Korean NLP", "Hate Speech Detection", "Dataset", "Coling2022"], "is_gated": false}, "gsarti/mt_geneval": {"dataset_name": "gsarti/mt_geneval", "description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.", "downloads": 41, "configs": {"sentences_en_ar": {"config_name": "sentences_en_ar", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"At 12 years old, she became an assistant stick gi...\", \"reference_feminine\": \"\\\"\\\\u0641\\\\u064a \\\\u0633\\\\u0646 \\\\u0627\\\\u0644\\\\u062b\\\\u062...\", \"source_masculine\": \"\\\"At 12 years old, he became an assistant stick boy...\", \"reference_masculine\": \"\\\"\\\\u0641\\\\u064a \\\\u0633\\\\u0646 \\\\u0627\\\\u0644\\\\u062b\\\\u062...\", \"source_feminine_annotated\": \"\\\"At 12 years old, she became an assistant s...\", \"reference_feminine_annotated\": \"\\\"\\\\u0641\\\\u064a \\\\u0633\\\\u0646 \\\\u0627\\\\u0644\\\\u062b\\\\u062...\", \"source_masculine_annotated\": \"\\\"At 12 years old, he became an assistant st...\", \"reference_masculine_annotated\": \"\\\"\\\\u0641\\\\u064a \\\\u0633\\\\u0646 \\\\u0627\\\\u0644\\\\u062b\\\\u062...\", \"source_feminine_keywords\": \"\\\"she;girl\\\"\", \"reference_feminine_keywords\": \"\\\"\\\\u0623\\\\u0635\\\\u0628\\\\u062d\\\\u062a \\\\u062d\\\\u0627\\\\u0645...\", \"source_masculine_keywords\": \"\\\"he;boy\\\"\", \"reference_masculine_keywords\": \"\\\"\\\\u0623\\\\u0635\\\\u0628\\\\u062d \\\\u062d\\\\u0627\\\\u0645\\\\u0644...\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_ar": {"config_name": "context_en_ar", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"He continued to gain recognition as an applied ar...\", \"source\": \"\\\"After these wins, Brodovitch's career as an appli...\", \"reference_original\": \"\\\"\\\\u0628\\\\u0639\\\\u062f \\\\u0647\\\\u0630\\\\u0647 \\\\u0627\\\\u064...\", \"reference_flipped\": \"\\\"\\\\u0628\\\\u0639\\\\u062f \\\\u0647\\\\u0630\\\\u0647 \\\\u0627\\\\u064...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_fr": {"config_name": "sentences_en_fr", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"Career She started her singing career in 2003 wit...\", \"reference_feminine\": \"\\\"Carri\\\\u00e8re Elle d\\\\u00e9buta sa carri\\\\u00e8re d...\", \"source_masculine\": \"\\\"Career He started his singing career in 2003 with...\", \"reference_masculine\": \"\\\"Carri\\\\u00e8re Il d\\\\u00e9buta sa carri\\\\u00e8re de ...\", \"source_feminine_annotated\": \"\\\"Career She started her singing care...\", \"reference_feminine_annotated\": \"\\\"Carri\\\\u00e8re Elle d\\\\u00e9buta sa carri\\\\u0...\", \"source_masculine_annotated\": \"\\\"Career He started his singing caree...\", \"reference_masculine_annotated\": \"\\\"Carri\\\\u00e8re Il d\\\\u00e9buta sa carri\\\\u00e...\", \"source_feminine_keywords\": \"\\\"She;her;her;her\\\"\", \"reference_feminine_keywords\": \"\\\"Elle;chanteuse\\\"\", \"source_masculine_keywords\": \"\\\"He;his;his;his\\\"\", \"reference_masculine_keywords\": \"\\\"Il;chanteur\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_fr": {"config_name": "context_en_fr", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"He then went to Sydney and then Melbourne holding...\", \"source\": \"\\\"Evergood was a capable artist, who mostly painted...\", \"reference_original\": \"\\\"Evergood \\\\u00e9tait un artiste comp\\\\u00e9tent, qu...\", \"reference_flipped\": \"\\\"Evergood \\\\u00e9tait une artiste comp\\\\u00e9tente, ...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_de": {"config_name": "sentences_en_de", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"After some wrangling Blacket accepted \\\\u00a350 in...\", \"reference_feminine\": \"\\\"Nach einigem Hin und Her akzeptierte Blacket 50 P...\", \"source_masculine\": \"\\\"After some wrangling Blacket accepted \\\\u00a350 in...\", \"reference_masculine\": \"\\\"Nach einigem Hin und Her akzeptierte Blacket 50 P...\", \"source_feminine_annotated\": \"\\\"After some wrangling Blacket accepted \\\\u00a350 in...\", \"reference_feminine_annotated\": \"\\\"Nach einigem Hin und Her akzeptierte Blacket 50 P...\", \"source_masculine_annotated\": \"\\\"After some wrangling Blacket accepted \\\\u00a350 in...\", \"reference_masculine_annotated\": \"\\\"Nach einigem Hin und Her akzeptierte Blacket 50 P...\", \"source_feminine_keywords\": \"\\\"her\\\"\", \"reference_feminine_keywords\": \"\\\"ihr\\\"\", \"source_masculine_keywords\": \"\\\"him\\\"\", \"reference_masculine_keywords\": \"\\\"ihm\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_de": {"config_name": "context_en_de", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"As Professor of Painting at the Royal Academy he ...\", \"source\": \"\\\"Clausen was an official war artist during World W...\", \"reference_original\": \"\\\"Clausen war ein offizieller Kriegsk\\\\u00fcnstler w...\", \"reference_flipped\": \"\\\"Clausen war eine offizielle Kriegsk\\\\u00fcnstlerin...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_hi": {"config_name": "sentences_en_hi", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"In 1898, the empress authorized the creation of a...\", \"reference_feminine\": \"\\\"1898 \\\\u092e\\\\u0947\\\\u0902, \\\\u0938\\\\u093e\\\\u092e\\\\u094d...\", \"source_masculine\": \"\\\"In 1898, the emperor authorized the creation of a...\", \"reference_masculine\": \"\\\"1898 \\\\u092e\\\\u0947\\\\u0902, \\\\u0938\\\\u092e\\\\u094d\\\\u0930...\", \"source_feminine_annotated\": \"\\\"In 1898, the empress authorized the creati...\", \"reference_feminine_annotated\": \"\\\"1898 \\\\u092e\\\\u0947\\\\u0902, \\\\u0938\\\\u093e\\\\u092e\\\\u0...\", \"source_masculine_annotated\": \"\\\"In 1898, the emperor authorized the creati...\", \"reference_masculine_annotated\": \"\\\"1898 \\\\u092e\\\\u0947\\\\u0902, \\\\u0938\\\\u092e\\\\u094d\\\\u0...\", \"source_feminine_keywords\": \"\\\"empress\\\"\", \"reference_feminine_keywords\": \"\\\"\\\\u0938\\\\u093e\\\\u092e\\\\u094d\\\\u0930\\\\u093e\\\\u091c\\\\u094d\\\\...\", \"source_masculine_keywords\": \"\\\"emperor\\\"\", \"reference_masculine_keywords\": \"\\\"\\\\u0938\\\\u092e\\\\u094d\\\\u0930\\\\u093e\\\\u091f\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_hi": {"config_name": "context_en_hi", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"The story of GPA's downfall is told by Christophe...\", \"source\": \"\\\"It is based on a contemporaneous diary of events ...\", \"reference_original\": \"\\\"\\\\u092f\\\\u0939 1990 \\\\u0938\\\\u0947 1996 \\\\u0924\\\\u0915 ...\", \"reference_flipped\": \"\\\"\\\\u092f\\\\u0939 1990 \\\\u0938\\\\u0947 1996 \\\\u0924\\\\u0915 ...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_it": {"config_name": "sentences_en_it", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"Pagratidis quickly recanted her confession, claim...\", \"reference_feminine\": \"\\\"Pagratidis subito ritratt\\\\u00f2 la sua confession...\", \"source_masculine\": \"\\\"Pagratidis quickly recanted his confession, claim...\", \"reference_masculine\": \"\\\"Pagratidis subito ritratt\\\\u00f2 la sua confession...\", \"source_feminine_annotated\": \"\\\"Pagratidis quickly recanted her confession...\", \"reference_feminine_annotated\": \"\\\"Pagratidis subito ritratt\\\\u00f2 la sua confession...\", \"source_masculine_annotated\": \"\\\"Pagratidis quickly recanted his confession...\", \"reference_masculine_annotated\": \"\\\"Pagratidis subito ritratt\\\\u00f2 la sua confession...\", \"source_feminine_keywords\": \"\\\"her;she;her;she;her\\\"\", \"reference_feminine_keywords\": \"\\\"stata picchiata;ferma\\\"\", \"source_masculine_keywords\": \"\\\"his;he;his;he;his\\\"\", \"reference_masculine_keywords\": \"\\\"stato picchiato;fermo\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_it": {"config_name": "context_en_it", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"Pierpont told of entering and holding up the bank...\", \"source\": \"\\\"However, Pierpont stated that Skeer was the plann...\", \"reference_original\": \"\\\"Comunque, Pierpont disse che Skeer era il pianifi...\", \"reference_flipped\": \"\\\"Comunque, Pierpont disse che Skeer era la pianifi...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_pt": {"config_name": "sentences_en_pt", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"In variants of Bluebeard, the wife's curiosity is...\", \"reference_feminine\": \"\\\"Em varia\\\\u00e7\\\\u00f5es da hist\\\\u00f3ria do Barba ...\", \"source_masculine\": \"\\\"In variants of Bluebeard, the husband's curiosity...\", \"reference_masculine\": \"\\\"Em varia\\\\u00e7\\\\u00f5es da hist\\\\u00f3ria do Barba ...\", \"source_feminine_annotated\": \"\\\"In variants of Bluebeard, the wife's curio...\", \"reference_feminine_annotated\": \"\\\"Em varia\\\\u00e7\\\\u00f5es da hist\\\\u00f3ria do Barba ...\", \"source_masculine_annotated\": \"\\\"In variants of Bluebeard, the husband's cu...\", \"reference_masculine_annotated\": \"\\\"Em varia\\\\u00e7\\\\u00f5es da hist\\\\u00f3ria do Barba ...\", \"source_feminine_keywords\": \"\\\"wife's;she\\\"\", \"reference_feminine_keywords\": \"\\\"da esposa;ela\\\"\", \"source_masculine_keywords\": \"\\\"husband's;he\\\"\", \"reference_masculine_keywords\": \"\\\"do marido;ele\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_pt": {"config_name": "context_en_pt", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"He wrote a retrospective of his work and its cont...\", \"source\": \"\\\"Goguen was a practitioner of Tibetan Buddhism.\\\"...\", \"reference_original\": \"\\\"Goguen era um praticante do Budismo Tibetano.\\\"\", \"reference_flipped\": \"\\\"Goguen era uma praticante do Budismo Tibetano.\\\"...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_ru": {"config_name": "sentences_en_ru", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"Mrs Duncan supported the idea, and government bod...\", \"reference_feminine\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0436\\\\u0430 \\\\u0414...\", \"source_masculine\": \"\\\"Mr Duncan supported the idea, and government bodi...\", \"reference_masculine\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0434\\\\u0438\\\\u043d ...\", \"source_feminine_annotated\": \"\\\"Mrs Duncan supported the idea, and governm...\", \"reference_feminine_annotated\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0436\\\\u0430...\", \"source_masculine_annotated\": \"\\\"Mr Duncan supported the idea, and governme...\", \"reference_masculine_annotated\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0434\\\\u0438\\\\u04...\", \"source_feminine_keywords\": \"\\\"Mrs\\\"\", \"reference_feminine_keywords\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0436\\\\u0430;\\\\u043f...\", \"source_masculine_keywords\": \"\\\"Mr\\\"\", \"reference_masculine_keywords\": \"\\\"\\\\u0413\\\\u043e\\\\u0441\\\\u043f\\\\u043e\\\\u0434\\\\u0438\\\\u043d;...\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_ru": {"config_name": "context_en_ru", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"\\\\u201cDictators are stupid,\\\\u201d he noted, \\\\u201...\", \"source\": \"\\\"As a young artist in Baghdad in the 1980s, Alfraj...\", \"reference_original\": \"\\\"\\\\u0412 1980-\\\\u0445 \\\\u0433\\\\u043e\\\\u0434\\\\u0430\\\\u0445...\", \"reference_flipped\": \"\\\"\\\\u0412 1980-\\\\u0445 \\\\u0433\\\\u043e\\\\u0434\\\\u0430\\\\u0445...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "sentences_en_es": {"config_name": "sentences_en_es", "sample_row": "{\"orig_id\": \"0\", \"source_feminine\": \"\\\"Morgan was asked by Fey to play the role, and she...\", \"reference_feminine\": \"\\\"Fey le pidi\\\\u00f3 a Morgan que interpretara el pa...\", \"source_masculine\": \"\\\"Morgan was asked by Fey to play the role, and he ...\", \"reference_masculine\": \"\\\"Fey le pidi\\\\u00f3 a Morgan que interpretara el pa...\", \"source_feminine_annotated\": \"\\\"Morgan was asked by Fey to play the role, and ...\", \"reference_feminine_annotated\": \"\\\"Fey le pidi\\\\u00f3 a Morgan que interpretara el pa...\", \"source_masculine_annotated\": \"\\\"Morgan was asked by Fey to play the role, and ...\", \"reference_masculine_annotated\": \"\\\"Fey le pidi\\\\u00f3 a Morgan que interpretara el pa...\", \"source_feminine_keywords\": \"\\\"she;her;her\\\"\", \"reference_feminine_keywords\": \"\\\"ella\\\"\", \"source_masculine_keywords\": \"\\\"he;his;him\\\"\", \"reference_masculine_keywords\": \"\\\"\\\\u00e9l\\\"\"}", "columns": ["orig_id", "source_feminine", "reference_feminine", "source_masculine", "reference_masculine", "source_feminine_annotated", "reference_feminine_annotated", "source_masculine_annotated", "reference_masculine_annotated", "source_feminine_keywords", "reference_feminine_keywords", "source_masculine_keywords", "reference_masculine_keywords"], "columns_mapping": {"orig_id": "orig_id", "source_feminine": "source_feminine", "reference_feminine": "reference_feminine", "source_masculine": "source_masculine", "reference_masculine": "reference_masculine", "source_feminine_annotated": "source_feminine_annotated", "reference_feminine_annotated": "reference_feminine_annotated", "source_masculine_annotated": "source_masculine_annotated", "reference_masculine_annotated": "reference_masculine_annotated", "source_feminine_keywords": "source_feminine_keywords", "reference_feminine_keywords": "reference_feminine_keywords", "source_masculine_keywords": "source_masculine_keywords", "reference_masculine_keywords": "reference_masculine_keywords"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}, "context_en_es": {"config_name": "context_en_es", "sample_row": "{\"orig_id\": \"0\", \"context\": \"\\\"In 1994\\\\u201395, he conducted a research project ...\", \"source\": \"\\\"Ritchin is a prolific author and curator, focusin...\", \"reference_original\": \"\\\"Ritchin es un autor y conservador prol\\\\u00edfico,...\", \"reference_flipped\": \"\\\"Ritchin es una autora y conservadora prol\\\\u00edfi...\"}", "columns": ["orig_id", "context", "source", "reference_original", "reference_flipped"], "columns_mapping": {"orig_id": "orig_id", "context": "context", "source": "source", "reference_original": "reference_original", "reference_flipped": "reference_flipped"}, "dataset_description": "The MT-GenEval benchmark evaluates gender translation accuracy on English -> {Arabic, French, German, Hindi, Italian, \nPortuguese, Russian, Spanish}. The dataset contains individual sentences with annotations on the gendered target words,\nand contrastive original-invertend translations with additional preceding context.\n", "dataset_name": "gsarti/mt_geneval"}}, "tags": ["task_categories:translation", "annotations_creators:expert-generated", "multilinguality:translation", "source_datasets:original", "language:en", "language:it", "language:fr", "language:ar", "language:de", "language:hi", "language:pt", "language:ru", "language:es", "gender", "constrained mt"], "is_gated": false}, "sagnikrayc/snli-cf-kaushik": {"dataset_name": "sagnikrayc/snli-cf-kaushik", "description": "The SNLI corpus (version 1.0) is a collection of 570k human-written English sentence pairs manually labeled for balanced classification with the labels entailment, contradiction, and neutral, supporting the task of natural language inference (NLI), also known as recognizing textual entailment (RTE). In the ICLR 2020 paper [Learning the Difference that Makes a Difference with Counterfactually-Augmented Data](https://openreview.net/forum?id=Sklgs0NFvr), Kaushik et. al. provided a dataset with counterfactual perturbations on the SNLI and IMDB data. This repository contains the original and counterfactual perturbations for the SNLI data, which was generated after processing the original data from [here](https://github.com/acmi-lab/counterfactually-augmented-data).", "downloads": 15, "configs": {"plain_text": {"config_name": "plain_text", "sample_row": "{\"idx\": \"\\\"3021531305.jpg#0r1n-orig\\\"\", \"premise\": \"\\\"A man is riding a red motorcycle with a small chi...\", \"hypothesis\": \"\\\"A man rides his motorcyle with his won.\\\"\", \"label\": \"\\\"neutral\\\"\", \"type\": \"\\\"original\\\"\"}", "columns": ["idx", "premise", "hypothesis", "label", "type"], "columns_mapping": {"idx": "idx", "premise": "premise", "hypothesis": "hypothesis", "label": "label", "type": "type"}, "dataset_description": "The SNLI corpus (version 1.0) is a collection of 570k human-written English sentence pairs manually labeled for balanced classification with the labels entailment, contradiction, and neutral, supporting the task of natural language inference (NLI), also known as recognizing textual entailment (RTE). In the ICLR 2020 paper [Learning the Difference that Makes a Difference with Counterfactually-Augmented Data](https://openreview.net/forum?id=Sklgs0NFvr), Kaushik et. al. provided a dataset with counterfactual perturbations on the SNLI and IMDB data. This repository contains the original and counterfactual perturbations for the SNLI data, which was generated after processing the original data from [here](https://github.com/acmi-lab/counterfactually-augmented-data).", "dataset_name": "sagnikrayc/snli-cf-kaushik"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "task_ids:multi-input-text-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:extended|snli", "language:en"], "is_gated": false}, "vesteinn/swe-nerc": {"dataset_name": "vesteinn/swe-nerc", "description": "The corpus consists of ca. 150.000 words of text.", "downloads": 199, "configs": {"swe-nerc": {"config_name": "swe-nerc", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Det\\\", \\\"har\\\", \\\"iaf\\\", \\\"jag\\\", \\\"gjort\\\", \\\"men\\\", \\\"ska\\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "The corpus consists of ca. 150.000 words of text.\n", "dataset_name": "vesteinn/swe-nerc"}}, "tags": [], "is_gated": false}, "shunk031/jsnli": {"dataset_name": "shunk031/jsnli", "description": "== \u65e5\u672c\u8a9eSNLI(JSNLI)\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 ==\n\nSNLI \u30b3\u30fc\u30d1\u30b9\u3092\u65e5\u672c\u8a9e\u306b\u7ffb\u8a33\u3057\u305f\u81ea\u7136\u8a00\u8a9e\u63a8\u8ad6\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\n\u5b66\u7fd2\u30c7\u30fc\u30bf\u306f\u5143\u30c7\u30fc\u30bf\u3092\u7ffb\u8a33\u3057\u3001\u8a08\u7b97\u6a5f\u306b\u3088\u308b\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u306b\u3088\u3063\u3066\u4f5c\u6210\n\u8a55\u4fa1\u30c7\u30fc\u30bf\u306f\u65e5\u672c\u8a9e\u3068\u3057\u3066\u610f\u5473\u304c\u901a\u308b\u304b\u3001\u7ffb\u8a33\u5f8c\u306e\u30e9\u30d9\u30eb\u304c\u5143\u306e\u30e9\u30d9\u30eb\u3068\u4e00\u81f4\u3057\u3066\u3044\u308b\u304b\u3069\u3046\u304b\u306e2\u6bb5\u968e\u306e\u30af\u30e9\u30a6\u30c9\u30bd\u30fc\u30b7\u30f3\u30b0\u306b\u3088\u308a\u30c7\u30fc\u30bf\u3092\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0", "downloads": 52, "configs": {"with-filtering": {"config_name": "with-filtering", "sample_row": "{\"premise\": \"\\\"\\\\u30ac\\\\u30ec\\\\u30fc\\\\u30b8 \\\\u3067 \\\\u3001 \\\\u58c1 \\\\u3...\", \"hypothesis\": \"\\\"\\\\u7537 \\\\u306f \\\\u9b54\\\\u6cd5 \\\\u306e \\\\u30b7\\\\u30e7\\\\u3...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "== \u65e5\u672c\u8a9eSNLI(JSNLI)\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 ==\n\nSNLI \u30b3\u30fc\u30d1\u30b9\u3092\u65e5\u672c\u8a9e\u306b\u7ffb\u8a33\u3057\u305f\u81ea\u7136\u8a00\u8a9e\u63a8\u8ad6\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\n\u5b66\u7fd2\u30c7\u30fc\u30bf\u306f\u5143\u30c7\u30fc\u30bf\u3092\u7ffb\u8a33\u3057\u3001\u8a08\u7b97\u6a5f\u306b\u3088\u308b\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u306b\u3088\u3063\u3066\u4f5c\u6210\n\u8a55\u4fa1\u30c7\u30fc\u30bf\u306f\u65e5\u672c\u8a9e\u3068\u3057\u3066\u610f\u5473\u304c\u901a\u308b\u304b\u3001\u7ffb\u8a33\u5f8c\u306e\u30e9\u30d9\u30eb\u304c\u5143\u306e\u30e9\u30d9\u30eb\u3068\u4e00\u81f4\u3057\u3066\u3044\u308b\u304b\u3069\u3046\u304b\u306e2\u6bb5\u968e\u306e\u30af\u30e9\u30a6\u30c9\u30bd\u30fc\u30b7\u30f3\u30b0\u306b\u3088\u308a\u30c7\u30fc\u30bf\u3092\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\n", "dataset_name": "shunk031/jsnli"}, "without-filtering": {"config_name": "without-filtering", "sample_row": "{\"premise\": \"\\\"\\\\u30ac\\\\u30ec\\\\u30fc\\\\u30b8 \\\\u3067 \\\\u3001 \\\\u58c1 \\\\u3...\", \"hypothesis\": \"\\\"\\\\u7537 \\\\u306f \\\\u9b54\\\\u6cd5 \\\\u306e \\\\u30b7\\\\u30e7\\\\u3...\", \"label\": \"1\"}", "columns": ["premise", "hypothesis", "label"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label"}, "dataset_description": "== \u65e5\u672c\u8a9eSNLI(JSNLI)\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 ==\n\nSNLI \u30b3\u30fc\u30d1\u30b9\u3092\u65e5\u672c\u8a9e\u306b\u7ffb\u8a33\u3057\u305f\u81ea\u7136\u8a00\u8a9e\u63a8\u8ad6\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\n\u5b66\u7fd2\u30c7\u30fc\u30bf\u306f\u5143\u30c7\u30fc\u30bf\u3092\u7ffb\u8a33\u3057\u3001\u8a08\u7b97\u6a5f\u306b\u3088\u308b\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\u306b\u3088\u3063\u3066\u4f5c\u6210\n\u8a55\u4fa1\u30c7\u30fc\u30bf\u306f\u65e5\u672c\u8a9e\u3068\u3057\u3066\u610f\u5473\u304c\u901a\u308b\u304b\u3001\u7ffb\u8a33\u5f8c\u306e\u30e9\u30d9\u30eb\u304c\u5143\u306e\u30e9\u30d9\u30eb\u3068\u4e00\u81f4\u3057\u3066\u3044\u308b\u304b\u3069\u3046\u304b\u306e2\u6bb5\u968e\u306e\u30af\u30e9\u30a6\u30c9\u30bd\u30fc\u30b7\u30f3\u30b0\u306b\u3088\u308a\u30c7\u30fc\u30bf\u3092\u30d5\u30a3\u30eb\u30bf\u30ea\u30f3\u30b0\n", "dataset_name": "shunk031/jsnli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "task_ids:multi-input-text-classification", "multilinguality:monolingual", "language:ja", "natural-language-inference", "nli", "jsnli"], "is_gated": false}, "its5Q/yandex-q": {"dataset_name": "its5Q/yandex-q", "description": "This is a dataset of questions and answers scraped from Yandex.Q.", "downloads": 43, "configs": {"default": {"config_name": "default", "sample_row": "{\"description\": \"\\\"\\\"\", \"question\": \"\\\"\\\\u041a\\\\u0430\\\\u043a \\\\u0432\\\\u043e\\\\u0439\\\\u0442\\\\u0438...\", \"answer\": \"\\\"\\\\u041d\\\\u0438\\\\u043a\\\\u0430\\\\u043a \\\\u043d\\\\u043e \\\\u043...\"}", "columns": ["description", "question", "answer"], "columns_mapping": {"description": "description", "question": "question", "answer": "answer"}, "dataset_description": "This is a dataset of questions and answers scraped from Yandex.Q.\n", "dataset_name": "its5Q/yandex-q"}}, "tags": ["task_categories:text-generation", "task_categories:question-answering", "task_ids:language-modeling", "task_ids:open-domain-qa", "annotations_creators:crowdsourced", "multilinguality:monolingual", "source_datasets:original", "language:ru"], "is_gated": false}, "RobotsMaliAI/bayelemabaga": {"dataset_name": "RobotsMaliAI/bayelemabaga", "description": "The Bayelemabaga dataset is a collection of 44160 aligned machine translation ready Bambara-French lines, \noriginating from Corpus Bambara de Reference. The dataset is constitued of text extracted from 231 source files, \nvaring from periodicals, books, short stories, blog posts, part of the Bible and the Quran.", "downloads": 58, "configs": {"bam-fr": {"config_name": "bam-fr", "sample_row": "{\"translation.bam\": \"\\\"Mieru Baa ka maana. Ayiwa!\\\"\", \"translation.fr\": \"\\\"Le recit de Mieru Baa Eh bien! Fanta Maa.\\\"\"}", "columns": ["translation_bam", "translation_fr"], "columns_mapping": {"translation.bam": "translation_bam", "translation.fr": "translation_fr"}, "dataset_description": "The Bayelemabaga dataset is a collection of 44160 aligned machine translation ready Bambara-French lines, \noriginating from Corpus Bambara de Reference. The dataset is constitued of text extracted from 231 source files, \nvaring from periodicals, books, short stories, blog posts, part of the Bible and the Quran.\n", "dataset_name": "RobotsMaliAI/bayelemabaga"}, "fr-bam": {"config_name": "fr-bam", "sample_row": "{\"translation.fr\": \"\\\"Le recit de Mieru Baa Eh bien! Fanta Maa.\\\"\", \"translation.bam\": \"\\\"Mieru Baa ka maana. Ayiwa!\\\"\"}", "columns": ["translation_fr", "translation_bam"], "columns_mapping": {"translation.fr": "translation_fr", "translation.bam": "translation_bam"}, "dataset_description": "The Bayelemabaga dataset is a collection of 44160 aligned machine translation ready Bambara-French lines, \noriginating from Corpus Bambara de Reference. The dataset is constitued of text extracted from 231 source files, \nvaring from periodicals, books, short stories, blog posts, part of the Bible and the Quran.\n", "dataset_name": "RobotsMaliAI/bayelemabaga"}}, "tags": ["task_categories:translation", "task_categories:text-generation", "language:bm", "language:fr"], "is_gated": false}, "ipipan/nkjp1m": {"dataset_name": "ipipan/nkjp1m", "description": "This is the official dataset for NKJP1M \u2013 the 1-million token subcorpus of the\nNational Corpus of Polish (Narodowy Korpus J\u0119zyka Polskiego)\n\nBesides the text (divided into paragraphs/samples and sentences) the\nset contains lemmas and morpho-syntactic tags for all tokens in the corpus.\n\nThis release corresponds to the version 1.2 of the corpus with\nfollowing corrections and improvements. In particular the\nmorpho-syntactic annotation has been aligned with the present version\nof Morfeusz2 morphological analyser.", "downloads": 14, "configs": {"nkjp1m": {"config_name": "nkjp1m", "sample_row": "{\"nkjp_text\": \"\\\"NKJP_1M_0102000000001\\\"\", \"nkjp_par\": \"\\\"morph_1-p\\\"\", \"nkjp_sent\": \"\\\"morph_1.57-s\\\"\", \"tokens\": \"[\\\"Zatrzasn\\\\u0105\\\\u0142\\\", \\\"drzwi\\\", \\\"od\\\", \\\"mieszkani...\", \"lemmas\": \"[\\\"zatrzasn\\\\u0105\\\\u0107\\\", \\\"drzwi\\\", \\\"od\\\", \\\"mieszkani...\", \"cposes\": \"[11, 6, 9, 6, 10, 7, 6, 11, 6, 10, 11, 6, 10, 2, 1...\", \"poses\": \"[30, 35, 32, 35, 19, 20, 35, 30, 35, 19, 30, 35, 1...\", \"tags\": \"[869, 910, 888, 975, 266, 277, 907, 869, 961, 266,...\", \"nps\": \"[false, false, false, false, true, false, false, f...\", \"nkjp_ids\": \"[\\\"morph_1.1-seg\\\", \\\"morph_1.2-seg\\\", \\\"morph_1.3-seg\\\"...\"}", "columns": ["nkjp_text", "nkjp_par", "nkjp_sent", "tokens", "lemmas", "cposes", "poses", "tags", "nps", "nkjp_ids"], "columns_mapping": {"nkjp_text": "nkjp_text", "nkjp_par": "nkjp_par", "nkjp_sent": "nkjp_sent", "tokens": "tokens", "lemmas": "lemmas", "cposes": "cposes", "poses": "poses", "tags": "tags", "nps": "nps", "nkjp_ids": "nkjp_ids"}, "dataset_description": "This is the official dataset for NKJP1M \u2013 the 1-million token subcorpus of the\nNational Corpus of Polish (Narodowy Korpus J\u0119zyka Polskiego)\n\nBesides the text (divided into paragraphs/samples and sentences) the\nset contains lemmas and morpho-syntactic tags for all tokens in the corpus.\n\nThis release corresponds to the version 1.2 of the corpus with\nfollowing corrections and improvements. In particular the\nmorpho-syntactic annotation has been aligned with the present version\nof Morfeusz2 morphological analyser.\n\n", "dataset_name": "ipipan/nkjp1m"}}, "tags": ["task_categories:token-classification", "task_ids:part-of-speech", "task_ids:lemmatization", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pl", "National Corpus of Polish", "Narodowy Korpus J\u0119zyka Polskiego"], "is_gated": false}, "masakhane/masakhaner2": {"dataset_name": "masakhane/masakhaner2", "description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811", "downloads": 1145, "configs": {"bam": {"config_name": "bam", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Nin\\\", \\\"waati\\\", \\\"in\\\", \\\"na\\\", \\\",\\\", \\\"a\\\", \\\"ka\\\", \\\"g\\\\u0...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "bbj": {"config_name": "bbj", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Msa\\\\u02bcny\\\\u0259\\\\u0302\\\", \\\"g\\\\u0254ti\\\\u0301\\\", \\\"cy...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "ewe": {"config_name": "ewe", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Le\\\", \\\"kwasi\\\\u0256a\\\", \\\"si\\\", \\\"va\\\", \\\"yi\\\", \\\"me\\\", \\\"la...\", \"ner_tags\": \"[0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "fon": {"config_name": "fon", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Dot\\\\u00f3oxw\\\\u00e9\\\", \\\"\\\\u0254\\\\u0301\\\", \\\"\\\\u0256\\\\u00...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "hau": {"config_name": "hau", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Olurukoba\\\", \\\"ya\\\", \\\"ce\\\", \\\"hukumar\\\", \\\"ta\\\", \\\"kwasta...\", \"ner_tags\": \"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "ibo": {"config_name": "ibo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u1ecc\\\", \\\"d\\\\u1ecb\\\", \\\"\\\\u1ecdt\\\\u1ee5t\\\\u1ee5\\\", \\\"ihe...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "kin": {"config_name": "kin", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Miss\\\", \\\"Nimwiza\\\", \\\"yavuze\\\", \\\"ko\\\", \\\"hari\\\", \\\"imish...\", \"ner_tags\": \"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "lug": {"config_name": "lug", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Obude\\\", \\\"bwali\\\", \\\"bunnyogovu\\\", \\\"nnyo\\\", \\\"we\\\", \\\"tw...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "luo": {"config_name": "luo", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Nyoriwoni\\\", \\\"ne\\\", \\\"oketo\\\", \\\"apisgi\\\", \\\"maduong'\\\",...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "mos": {"config_name": "mos", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"R\\\\u1ebd\\\", \\\"f\\\\u00e3a\\\", \\\"ne\\\", \\\"no\\\", \\\"-\\\", \\\"r\\\\u0269k...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "nya": {"config_name": "nya", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ukwati\\\", \\\"ndiye\\\", \\\"adamanga\\\", \\\"pa\\\", \\\"4\\\", \\\"Octobe...\", \"ner_tags\": \"[0, 0, 0, 0, 7, 8, 8, 0, 0, 3, 4, 4, 0, 5, 0, 0]...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "pcm": {"config_name": "pcm", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Di\\\", \\\"man\\\", \\\"go\\\", \\\"Twitter\\\", \\\"go\\\", \\\"reveal\\\", \\\"hi...\", \"ner_tags\": \"[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "sna": {"config_name": "sna", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Mumatunhu\\\", \\\"mapfumbamwe\\\", \\\"akavhiringwa\\\", \\\"neCy...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "swa": {"config_name": "swa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Maafisa\\\", \\\"wa\\\", \\\"serikali\\\", \\\"ya\\\", \\\"Yemen\\\", \\\"wame...\", \"ner_tags\": \"[0, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "tsn": {"config_name": "tsn", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"E\\\", \\\"ne\\\", \\\"e\\\", \\\"le\\\", \\\"motlotli\\\", \\\"wa\\\", \\\"dikgang\\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "twi": {"config_name": "twi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Mmom\\\", \\\"obi\\\", \\\"a\\\", \\\"\\\\u0254w\\\\u0254\\\", \\\"ahobr\\\\u025b...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "wol": {"config_name": "wol", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u00d1\\\\u00ebwoon\\\", \\\"teewee\\\", \\\"daraap\\\\u00f3o\\\", \\\"n...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 1...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "xho": {"config_name": "xho", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Konakala\\\", \\\"izinto\\\", \\\"emsebenzini\\\", \\\"emva\\\", \\\"kok...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "yor": {"config_name": "yor", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ba\\\\u0300ba\\\\u0301\\\", \\\"to\\\\u0301\\\", \\\"bi\\\\u0301\\\", \\\"Ba\\\\u...\", \"ner_tags\": \"[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}, "zul": {"config_name": "zul", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"Ngesizini\\\", \\\"edlule\\\", \\\"baphelela\\\", \\\"endaweni\\\", \\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags"}, "dataset_description": "MasakhaNER 2.0 is the largest publicly available high-quality dataset for named entity recognition (NER) in 20 African languages.\n\nNamed entities are phrases that contain the names of persons, organizations, locations, times and quantities.\n\nExample:\n[PER Wolff] , currently a journalist in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] .\nMasakhaNER is a named entity dataset consisting of PER, ORG, LOC, and DATE entities annotated by Masakhane for 20 African languages:\n- Bambara (bam)\n- Ghomala (bbj)\n- Ewe (ewe)\n- Fon (fon)\n- Hausa (hau)\n- Igbo (ibo)\n- Kinyarwanda (kin)\n- Luganda (lug)\n- Dholuo (luo) \n- Mossi (mos)\n- Chichewa (nya)\n- Nigerian Pidgin\n- chShona (sna)\n- Kiswahili (sw\u0105)\n- Setswana (tsn)\n- Twi (twi)\n- Wolof (wol)\n- isiXhosa (xho)\n- Yor\u00f9b\u00e1 (yor)\n- isiZulu (zul)\n\nThe train/validation/test sets are available for all the ten languages.\n\nFor more details see https://arxiv.org/abs/2103.11811\n", "dataset_name": "masakhane/masakhaner2"}}, "tags": ["task_categories:token-classification", "task_ids:named-entity-recognition", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:bm", "language:bbj", "language:ee", "language:fon", "language:ha", "language:ig", "language:rw", "language:lg", "language:luo", "language:mos", "language:ny", "language:pcm", "language:sn", "language:sw", "language:tn", "language:tw", "language:wo", "language:xh", "language:yo", "language:zu", "ner", "masakhaner", "masakhane"], "is_gated": false}, "ipipan/polqa": {"dataset_name": "ipipan/polqa", "description": "PolQA is the first Polish dataset for OpenQA. It consists of 7,000 questions, 87,525 manually labeled evidence passages, and a corpus of over 7 million candidate passages.", "downloads": 79, "configs": {"pairs": {"config_name": "pairs", "sample_row": "{\"question_id\": \"1\", \"passage_title\": \"\\\"Alfa\\\"\", \"passage_text\": \"\\\"Alfa (\\\\u1f04\\\\u03bb\\\\u03c6\\\\u03b1, pisana \\\\u0391\\\\u03...\", \"passage_wiki\": \"\\\"Alfa (\\\\u1f04\\\\u03bb\\\\u03c6\\\\u03b1, pisana \\\\u0391\\\\u03...\", \"passage_id\": \"\\\"19291-0\\\"\", \"duplicate\": \"true\", \"question\": \"\\\"Jak nazywa si\\\\u0119 pierwsza litera alfabetu grec...\", \"relevant\": \"true\", \"annotated_by\": \"\\\"Igor\\\"\", \"answers\": \"\\\"['alfa']\\\"\", \"question_formulation\": \"\\\"QUESTION\\\"\", \"question_type\": \"\\\"SINGLE ENTITY\\\"\", \"entity_type\": \"\\\"UNNAMED\\\"\", \"entity_subtype\": \"\\\"-\\\"\", \"split\": \"\\\"train\\\"\", \"passage_source\": \"\\\"zero-shot\\\"\"}", "columns": ["question_id", "passage_title", "passage_text", "passage_wiki", "passage_id", "duplicate", "question", "relevant", "annotated_by", "answers", "question_formulation", "question_type", "entity_type", "entity_subtype", "split", "passage_source"], "columns_mapping": {"question_id": "question_id", "passage_title": "passage_title", "passage_text": "passage_text", "passage_wiki": "passage_wiki", "passage_id": "passage_id", "duplicate": "duplicate", "question": "question", "relevant": "relevant", "annotated_by": "annotated_by", "answers": "answers", "question_formulation": "question_formulation", "question_type": "question_type", "entity_type": "entity_type", "entity_subtype": "entity_subtype", "split": "split", "passage_source": "passage_source"}, "dataset_description": "PolQA is the first Polish dataset for OpenQA. It consists of 7,000 questions, 87,525 manually labeled evidence passages, and a corpus of over 7 million candidate passages.\n", "dataset_name": "ipipan/polqa"}, "passages": {"config_name": "passages", "sample_row": "{\"id\": \"\\\"2-0\\\"\", \"title\": \"\\\"AWK\\\"\", \"text\": \"\\\"AWK \\\\u2013 interpretowany j\\\\u0119zyk programowani...\"}", "columns": ["id", "title", "text"], "columns_mapping": {"id": "id", "title": "title", "text": "text"}, "dataset_description": "PolQA is the first Polish dataset for OpenQA. It consists of 7,000 questions, 87,525 manually labeled evidence passages, and a corpus of over 7 million candidate passages.\n", "dataset_name": "ipipan/polqa"}}, "tags": ["task_categories:question-answering", "task_categories:text-retrieval", "task_categories:text2text-generation", "task_ids:open-domain-qa", "task_ids:document-retrieval", "task_ids:abstractive-qa", "annotations_creators:expert-generated", "language:pl"], "is_gated": false}, "orai-nlp/basqueGLUE": {"dataset_name": "orai-nlp/basqueGLUE", "description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.", "downloads": 55, "configs": {"bec": {"config_name": "bec", "sample_row": "{\"text\": \"\\\"Retweeted EH Bildu Bizkaia (@ehbildubizkaia):\\\\\\\\n\\\\...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "bhtc": {"config_name": "bhtc", "sample_row": "{\"text\": \"\\\"Diru-Sarrerak Bermatzeko Errenta (DSBE, gaztelera...\", \"label\": \"3\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "coref": {"config_name": "coref", "sample_row": "{\"text\": \"\\\"HIROMU NONAKA Japonian Gobernuan dagoen LDP Alder...\", \"span1_text\": \"\\\"HIROMU NONAKA Japonian Gobernuan dagoen LDP Alder...\", \"span2_text\": \"\\\"oposizioak\\\"\", \"label\": \"0\", \"span1_index\": \"0\", \"span2_index\": \"36\", \"idx\": \"0\"}", "columns": ["text", "span1_text", "span2_text", "label", "span1_index", "span2_index", "idx"], "columns_mapping": {"text": "text", "span1_text": "span1_text", "span2_text": "span2_text", "label": "label", "span1_index": "span1_index", "span2_index": "span2_index", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "intent": {"config_name": "intent", "sample_row": "{\"text\": \"\\\"aldatu alarma 7am-tik 7pm-ra , mesedez\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "nerc_id": {"config_name": "nerc_id", "sample_row": "{\"tokens\": \"[\\\"Greba\\\", \\\"orokorrera\\\", \\\"deitu\\\", \\\"du\\\", \\\"EHk\\\", \\\"27r...\", \"tags\": \"[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\", \"idx\": \"0\"}", "columns": ["tokens", "tags", "idx"], "columns_mapping": {"tokens": "tokens", "tags": "tags", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "nerc_od": {"config_name": "nerc_od", "sample_row": "{\"tokens\": \"[\\\"Greba\\\", \\\"orokorrera\\\", \\\"deitu\\\", \\\"du\\\", \\\"EHk\\\", \\\"27r...\", \"tags\": \"[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\", \"idx\": \"0\"}", "columns": ["tokens", "tags", "idx"], "columns_mapping": {"tokens": "tokens", "tags": "tags", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "qnli": {"config_name": "qnli", "sample_row": "{\"question\": \"\\\"Orain zer ikertzen ari da?\\\"\", \"sentence\": \"\\\"Hedabide askotan kolaboratu du, gehienak parapsik...\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["question", "sentence", "label", "idx"], "columns_mapping": {"question": "question", "sentence": "sentence", "label": "label", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "slot": {"config_name": "slot", "sample_row": "{\"tokens\": \"[\\\"aldatu\\\", \\\"alarma\\\", \\\"7am-tik\\\", \\\"7pm-ra\\\", \\\",\\\", \\\"me...\", \"tags\": \"[0, 0, 1, 12, 0, 0]\", \"idx\": \"0\"}", "columns": ["tokens", "tags", "idx"], "columns_mapping": {"tokens": "tokens", "tags": "tags", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "vaxx": {"config_name": "vaxx", "sample_row": "{\"text\": \"\\\"\\\\\\\"#COVID19 Oraingo datuak, izurriaren dinamika, t...\", \"label\": \"2\", \"idx\": \"0\"}", "columns": ["text", "label", "idx"], "columns_mapping": {"text": "text", "label": "label", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}, "wic": {"config_name": "wic", "sample_row": "{\"sentence1\": \"\\\"Egun hauetan Atlantako zilarraz galdetu diogun ba...\", \"sentence2\": \"\\\"Lance Armstrong eta Jan Ullrich ziren guztian aho...\", \"word\": \"\\\"itxaropen\\\"\", \"label\": \"0\", \"start1\": \"149\", \"start2\": \"89\", \"end1\": \"159\", \"end2\": \"100\", \"idx\": \"0\"}", "columns": ["sentence1", "sentence2", "word", "label", "start1", "start2", "end1", "end2", "idx"], "columns_mapping": {"sentence1": "sentence1", "sentence2": "sentence2", "word": "word", "label": "label", "start1": "start1", "start2": "start2", "end1": "end1", "end2": "end2", "idx": "idx"}, "dataset_description": "We present BasqueGLUE, the first NLU benchmark for Basque, which has been elaborated from \npreviously existing datasets and following similar criteria to those used for the construction of \nGLUE and SuperGLUE. BasqueGLUE is freely available under an open license.\n", "dataset_name": "orai-nlp/basqueGLUE"}}, "tags": ["task_categories:text-classification", "task_categories:token-classification", "task_ids:intent-classification", "task_ids:natural-language-inference", "task_ids:sentiment-classification", "task_ids:topic-classification", "task_ids:named-entity-recognition", "task_ids:coreference-resolution", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:eu"], "is_gated": false}, "neulab/docprompting-conala": {"dataset_name": "neulab/docprompting-conala", "description": "This is the re-split of CoNaLa dataset. For each code snippet in the dev and test set, at least one function is held out from the training set. This split aims at testing a code generation model's capacity in generating unseen functions.\nWe further make sure that examples from the same StackOverflow post (same question_id before -) are in the same split.", "downloads": 1014, "configs": {"data": {"config_name": "data", "sample_row": "{\"question_id\": \"\\\"348196-52\\\"\", \"nl\": \"\\\"Create list `instancelist` containing 29 objects ...\", \"cmd\": \"\\\"instancelist = [MyClass() for i in range(29)]\\\"\", \"oracle_man\": \"[\\\"python.library.functions#range\\\"]\", \"canonical_cmd\": \"\\\"VAR_STR = [MyClass() for i in range(29)]\\\"\", \"cmd_name\": \"\\\"conala\\\"\"}", "columns": ["question_id", "nl", "cmd", "oracle_man", "canonical_cmd", "cmd_name"], "columns_mapping": {"question_id": "question_id", "nl": "nl", "cmd": "cmd", "oracle_man": "oracle_man", "canonical_cmd": "canonical_cmd", "cmd_name": "cmd_name"}, "dataset_description": "This is the re-split of CoNaLa dataset. For each code snippet in the dev and test set, at least one function is held out from the training set. This split aims at testing a code generation model's capacity in generating unseen functions.\nWe further make sure that examples from the same StackOverflow post (same question_id before -) are in the same split.", "dataset_name": "neulab/docprompting-conala"}, "docs": {"config_name": "docs", "sample_row": "{\"doc_id\": \"\\\"tensorflow.aggregationmethod\\\"\", \"doc_content\": \"\\\"tf.AggregationMethod View source on GitHub ...\"}", "columns": ["doc_id", "doc_content"], "columns_mapping": {"doc_id": "doc_id", "doc_content": "doc_content"}, "dataset_description": "This is the re-split of CoNaLa dataset. For each code snippet in the dev and test set, at least one function is held out from the training set. This split aims at testing a code generation model's capacity in generating unseen functions.\nWe further make sure that examples from the same StackOverflow post (same question_id before -) are in the same split.", "dataset_name": "neulab/docprompting-conala"}}, "tags": ["task_categories:text2text-generation", "multilinguality:monolingual", "source_datasets:original", "language:code", "code-generation", "doc retrieval", "retrieval augmented generation"], "is_gated": false}, "aashsach/multiconer2": {"dataset_name": "aashsach/multiconer2", "description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition", "downloads": 37, "configs": {"bn": {"config_name": "bn", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u09b8\\\\u09cd\\\\u099f\\\\u09c7\\\\u09b6\\\\u09a8\\\\u099f\\\\u09bf...\", \"ner_tags\": \"[0, 41, 42, 42, 0, 0]\", \"ner_macro_tags\": \"[0, 9, 10, 10, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "de": {"config_name": "de", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"das\\\", \\\"geb\\\\u00e4ude\\\", \\\"hatte\\\", \\\"bis\\\", \\\"1984\\\", \\\"e...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0]\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "en": {"config_name": "en", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"robert\\\", \\\"gottschalk\\\", \\\"1939\\\", \\\"academy\\\", \\\"award...\", \"ner_tags\": \"[39, 40, 0, 63, 64, 0, 0, 0, 0, 35]\", \"ner_macro_tags\": \"[7, 8, 0, 3, 4, 0, 0, 0, 0, 5]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "es": {"config_name": "es", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u00e9douard\\\", \\\"herriot\\\", \\\"ou\\\", \\\"la\\\", \\\"r\\\\u00e9pu...\", \"ner_tags\": \"[43, 44, 0, 0, 0, 0, 0, 0]\", \"ner_macro_tags\": \"[7, 8, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "fa": {"config_name": "fa", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u06f2\\\\u06f0\\\\u06f1\\\\u06f0\\\", \\\"\\\\u060c\\\", \\\"\\\\u0633\\\\u06...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 25]\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 1]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "fr": {"config_name": "fr", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"elle\\\", \\\"porte\\\", \\\"le\\\", \\\"nom\\\", \\\"de\\\", \\\"la\\\", \\\"romanc...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0]\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "hi": {"config_name": "hi", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u092f\\\\u0939\\\", \\\"\\\\u091d\\\\u093f\\\\u092f\\\\u093e\\\\u0928\\\",...\", \"ner_tags\": \"[0, 25, 25, 0, 0, 0, 0, 0, 0]\", \"ner_macro_tags\": \"[0, 1, 1, 0, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "it": {"config_name": "it", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"paesaggio\\\", \\\"con\\\", \\\"figura\\\", \\\"(\\\", \\\"1865\\\", \\\"1885\\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 22, 22, 22, 0]...\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "pt": {"config_name": "pt", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"os\\\", \\\"moradores\\\", \\\"decidiram\\\", \\\"ent\\\\u00e3o\\\", \\\"do...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, ...\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "sv": {"config_name": "sv", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"nils\\\", \\\"perne\\\", \\\"svensk\\\", \\\"komposit\\\\u00f6r\\\", \\\"te...\", \"ner_tags\": \"[7, 8, 0, 0, 0, 0, 0]\", \"ner_macro_tags\": \"[7, 8, 0, 0, 0, 0, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "uk": {"config_name": "uk", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u043b\\\\u044c\\\\u0432\\\\u0443\\\\u0432\\\\u0435\\\\u043a\\\", \\\"(\\\"...\", \"ner_tags\": \"[0, 0, 0, 0, 0, 0, 0, 25, 0]\", \"ner_macro_tags\": \"[0, 0, 0, 0, 0, 0, 0, 1, 0]\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}, "zh": {"config_name": "zh", "sample_row": "{\"id\": \"\\\"0\\\"\", \"tokens\": \"[\\\"\\\\u5167\\\", \\\"\\\\u7a46\\\", \\\"\\\\u723e\\\", \\\"\\\\u00b7\\\", \\\"\\\\u54c8\\\",...\", \"ner_tags\": \"[43, 44, 44, 44, 44, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0...\", \"ner_macro_tags\": \"[7, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...\"}", "columns": ["id", "tokens", "ner_tags", "ner_macro_tags"], "columns_mapping": {"id": "id", "tokens": "tokens", "ner_tags": "ner_tags", "ner_macro_tags": "ner_macro_tags"}, "dataset_description": "SemEval 2023 Task 2: MultiCoNER II\nMultilingual Complex Named Entity Recognition\n", "dataset_name": "aashsach/multiconer2"}}, "tags": [], "is_gated": false}, "eloukas/edgar-corpus": {"dataset_name": "eloukas/edgar-corpus", "description": "The dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).", "downloads": 397, "configs": {"full": {"config_name": "full", "sample_row": "{\"filename\": \"\\\"92116_1993.txt\\\"\", \"cik\": \"\\\"92116\\\"\", \"year\": \"\\\"1993\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral\\\\nSouthern California Wa...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2 - Properties\\\\nFranchises, Competition, Acq...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nOn October 20, 1993, t...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant's Common Equity and...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nInformation resp...\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nInformation resp...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Exhibits, Financial Statement Schedules ...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1993": {"config_name": "year_1993", "sample_row": "{\"filename\": \"\\\"92116_1993.txt\\\"\", \"cik\": \"\\\"92116\\\"\", \"year\": \"\\\"1993\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral\\\\nSouthern California Wa...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2 - Properties\\\\nFranchises, Competition, Acq...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nOn October 20, 1993, t...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant's Common Equity and...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nInformation resp...\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nInformation resp...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Exhibits, Financial Statement Schedules ...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1994": {"config_name": "year_1994", "sample_row": "{\"filename\": \"\\\"814677_1994.txt\\\"\", \"cik\": \"\\\"814677\\\"\", \"year\": \"\\\"1994\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\nA. Introduction\\\\n(i) Background...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nAt December 31, 1994, the Com...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nThe Company is involve...\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR THE COMPANY'S COMMON EQUITY AN...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nSUMMARY OF SELEC...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTAL DAT...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF THE ...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nITEM 12.\\\"\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIPS OF CERTAIN BENEFICIA...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDULES,...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1995": {"config_name": "year_1995", "sample_row": "{\"filename\": \"\\\"823195_1995.txt\\\"\", \"cik\": \"\\\"823195\\\"\", \"year\": \"\\\"1995\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral Development of Business...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2. Properties\\\\nThe Partnership does not own ...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nIn or about April 1993...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant's Limited Partnersh...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\n(dollars in thou...\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nThe Partnership ...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Exhibits, Financial Statement Schedules,...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1996": {"config_name": "year_1996", "sample_row": "{\"filename\": \"\\\"319315_1996.txt\\\"\", \"cik\": \"\\\"319315\\\"\", \"year\": \"\\\"1996\\\"\", \"section_1\": \"\\\"ITEM 1 - Business\\\\nGeneral\\\\nThe response to this ...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2 - Properties\\\\nGeneral\\\\nThe Partnership's i...\", \"section_3\": \"\\\"ITEM 3 - Legal Proceedings\\\\nThere are no material...\", \"section_4\": \"\\\"ITEM 4 - Submission of Matters to a Vote of Secur...\", \"section_5\": \"\\\"ITEM 5 - Market for the Registrant's Common Equit...\", \"section_6\": \"\\\"ITEM 6 - Selected Financial Data\\\\nSelected financ...\", \"section_7\": \"\\\"ITEM 7 - Management's Discussion and Analysis of ...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"ITEM 8 - Financial Statements and Supplementary D...\", \"section_9\": \"\\\"ITEM 9 - Changes in and Disagreements with Accoun...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10 - Directors and Executive Officers of the...\", \"section_11\": \"\\\"ITEM 11 - Executive Compensation\\\\nThe Partnership...\", \"section_12\": \"\\\"ITEM 12 - Security Ownership of Certain Beneficia...\", \"section_13\": \"\\\"ITEM 13 - Certain Relationships and Related Trans...\", \"section_14\": \"\\\"ITEM 14 - Financial Statements, Schedules, Exhibi...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1997": {"config_name": "year_1997", "sample_row": "{\"filename\": \"\\\"820736_1997.txt\\\"\", \"cik\": \"\\\"820736\\\"\", \"year\": \"\\\"1997\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\nBACKGROUND\\\\nOrbital Sciences Co...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nOrbital owns or leases over 1...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nOn October 10, 1996, T...\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY AND...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nThe information ...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF THE ...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nThe information ...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDULES ...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1998": {"config_name": "year_1998", "sample_row": "{\"filename\": \"\\\"887919_1998.txt\\\"\", \"cik\": \"\\\"887919\\\"\", \"year\": \"\\\"1998\\\"\", \"section_1\": \"\\\"ITEM 1. DESCRIPTION OF BUSINESS\\\\nTHE COMPANY\\\\nPre...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nThe Company owns 115 North Ha...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nThe Banks are respecti...\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY AND...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nThe following ta...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"\\\"\", \"section_11\": \"\\\"\\\"\", \"section_12\": \"\\\"\\\"\", \"section_13\": \"\\\"\\\"\", \"section_14\": \"\\\"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDULES ...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_1999": {"config_name": "year_1999", "sample_row": "{\"filename\": \"\\\"854864_1999.txt\\\"\", \"cik\": \"\\\"854864\\\"\", \"year\": \"\\\"1999\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral Description of Partners...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2. Nonoperating Interests in Properties\\\\nAs ...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nThe Partnership is not...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market Price of and Distributions on the ...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nThe following se...\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Disagreements on Accounting and Financial...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nAs noted in Item...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"\\\"\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2000": {"config_name": "year_2000", "sample_row": "{\"filename\": \"\\\"1064728_2000.txt\\\"\", \"cik\": \"\\\"1064728\\\"\", \"year\": \"\\\"2000\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS.\\\\nOVERVIEW\\\\nWe are the world's l...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. PROPERTIES.\\\\nCOAL RESERVES\\\\nWe had an est...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS.\\\\nFrom time to time, we...\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY AND...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA.\\\\nP&L Coal Holdin...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF THE ...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION.\\\\nThe following t...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDULES,...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2001": {"config_name": "year_2001", "sample_row": "{\"filename\": \"\\\"18072_2001.htm\\\"\", \"cik\": \"\\\"18072\\\"\", \"year\": \"\\\"2001\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral\\\\nCascade Natural Gas Co...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2. Properties\\\\nAt September 30, 2001, Cascad...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nIncorporated herein by...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant's Common Equity and...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nItem 7.\\\"\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements With Account...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nReference is mad...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Exhibits, Financial Statement Schedules,...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2002": {"config_name": "year_2002", "sample_row": "{\"filename\": \"\\\"1121980_2002.htm\\\"\", \"cik\": \"\\\"1121980\\\"\", \"year\": \"\\\"2002\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\nOVERVIEW\\\\nHPL Technologies, Inc...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nThe following table sets for ...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nNone.\\\\nITEM 4.\\\"\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR COMMON STOCK AND RELATED STOCK...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nThe selected con...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS OF THE ...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nThere is incorpo...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. EXHIBITS, FINANCIAL STATEMENT SCHEDULES,...\", \"section_15\": \"\\\"\\\"\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2003": {"config_name": "year_2003", "sample_row": "{\"filename\": \"\\\"1224874_2003.txt\\\"\", \"cik\": \"\\\"1224874\\\"\", \"year\": \"\\\"2003\\\"\", \"section_1\": \"\\\"ITEM 1. Business.\\\\nNot Applicable\\\\nITEM 2.\\\"\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. Properties.\\\\nNot Applicable\\\\nITEM 3.\\\"\", \"section_3\": \"\\\"ITEM 3. Legal Proceedings.\\\\nNone.\\\\nITEM 4.\\\"\", \"section_4\": \"\\\"ITEM 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"ITEM 5. Market for Registrant's Common Equity and...\", \"section_6\": \"\\\"ITEM 6. Selected Financial Data.\\\\nNot Applicable\\\\...\", \"section_7\": \"\\\"ITEM 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"ITEM 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"ITEM 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"ITEM 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"ITEM 9A. Controls and Procedures\\\\nNot Applicable\\\\...\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. Directors and Executive Officers of Regi...\", \"section_11\": \"\\\"ITEM 11. Executive Compensation.\\\\nNot Applicable....\", \"section_12\": \"\\\"ITEM 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"ITEM 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"ITEM 14. Principal Accountant Fees and Services N...\", \"section_15\": \"\\\"ITEM 15. Exhibits, Financial Statement Schedules,...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2004": {"config_name": "year_2004", "sample_row": "{\"filename\": \"\\\"1287258_2004.htm\\\"\", \"cik\": \"\\\"1287258\\\"\", \"year\": \"\\\"2004\\\"\", \"section_1\": \"\\\"Item 1.\\\\nBusiness\\\\nCompany Overview\\\\nWe are a lea...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"Item 2.\\\\nProperties\\\\nWe are headquartered in San ...\", \"section_3\": \"\\\"Item 3.\\\\nLegal Proceedings\\\\nIn November 2000, a f...\", \"section_4\": \"\\\"Item 4.\\\\nSubmission of Matters to a Vote of Secur...\", \"section_5\": \"\\\"Item 5.\\\\nMarket for Registrant\\\\u2019s Common Equi...\", \"section_6\": \"\\\"Item 6.\\\\nSelected Financial Data\\\\nThe selected fi...\", \"section_7\": \"\\\"Item 7.\\\\nManagement\\\\u2019s Discussion and Analysi...\", \"section_7A\": \"\\\"Item 7A.\\\\nQuantitative and Qualitative Disclosure...\", \"section_8\": \"\\\"Item 8.\\\\nFinancial Statements and Supplementary D...\", \"section_9\": \"\\\"Item 9.\\\\nChanges in and Disagreements with Accoun...\", \"section_9A\": \"\\\"Item 9A.\\\\nControl and Procedures\\\\nWe maintain dis...\", \"section_9B\": \"\\\"Item 9B.\\\\nOther Information\\\\nNone.\\\\nPART III\\\\nIte...\", \"section_10\": \"\\\"Item 10.\\\\nDirectors and Executive Officers of the...\", \"section_11\": \"\\\"Item 11.\\\\nExecutive Compensation\\\\nIncorporated by...\", \"section_12\": \"\\\"Item 12.\\\\nSecurity Ownership of Certain Beneficia...\", \"section_13\": \"\\\"Item 13.\\\\nCertain Relationships and Related Trans...\", \"section_14\": \"\\\"Item 14.\\\\nPrincipal Accounting Fees and Services\\\\...\", \"section_15\": \"\\\"Item 15.\\\\nExhibits and Financial Statement Schedu...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2005": {"config_name": "year_2005", "sample_row": "{\"filename\": \"\\\"1319633_2005.txt\\\"\", \"cik\": \"\\\"1319633\\\"\", \"year\": \"\\\"2005\\\"\", \"section_1\": \"\\\"Item 1. Business.\\\\nNot applicable. See the Relief...\", \"section_1A\": \"\\\"Item 1A. Risk Factors.\\\\nNot applicable.\\\\nItem 1B....\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments.\\\\nNone.\\\\nItem ...\", \"section_2\": \"\\\"Item 2. Properties.\\\\nNot applicable. See the Reli...\", \"section_3\": \"\\\"Item 3. Legal Proceedings.\\\\nThere were no materia...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant's Common Equity, Re...\", \"section_6\": \"\\\"Item 6. Selected Financial Data.\\\\nNot applicable....\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"Item 7A Quantitative and Qualitative Disclosures ...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements With Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures.\\\\nNot applicable...\", \"section_9B\": \"\\\"Item 9B. Other Information.\\\\nNone.\\\\nPART III\\\\nIte...\", \"section_10\": \"\\\"Item 10. Directors and Executive Officers of the ...\", \"section_11\": \"\\\"Item 11. Executive Compensation.\\\\nNot applicable....\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accounting Fees and Services.\\\\...\", \"section_15\": \"\\\"Item 15. Exhibits and Financial Statement Schedul...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2006": {"config_name": "year_2006", "sample_row": "{\"filename\": \"\\\"1351893_2006.txt\\\"\", \"cik\": \"\\\"1351893\\\"\", \"year\": \"\\\"2006\\\"\", \"section_1\": \"\\\"\\\"\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"\\\"\", \"section_3\": \"\\\"\\\"\", \"section_4\": \"\\\"\\\"\", \"section_5\": \"\\\"\\\"\", \"section_6\": \"\\\"\\\"\", \"section_7\": \"\\\"\\\"\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"\\\"\", \"section_9\": \"\\\"\\\"\", \"section_9A\": \"\\\"\\\"\", \"section_9B\": \"\\\"ITEM 9B. OTHER INFORMATION.\\\\nNone.\\\\nPART IV\\\\nITEM...\", \"section_10\": \"\\\"\\\"\", \"section_11\": \"\\\"\\\"\", \"section_12\": \"\\\"\\\"\", \"section_13\": \"\\\"\\\"\", \"section_14\": \"\\\"\\\"\", \"section_15\": \"\\\"ITEM 15. EXHIBITS AND FINANCIAL STATEMENT SCHEDUL...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2007": {"config_name": "year_2007", "sample_row": "{\"filename\": \"\\\"1178336_2007.htm\\\"\", \"cik\": \"\\\"1178336\\\"\", \"year\": \"\\\"2007\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nForward-Looking Statements\\\\nThi...\", \"section_1A\": \"\\\"ITEM 1A. RISK FACTORS\\\\nRisks Related to Our Busin...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNone.\\\\nItem 2...\", \"section_2\": \"\\\"Item 2. Properties\\\\nWe own approximately 2.2 acre...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nFrom time to time, we ...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nThe following se...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Qualitative and Quantitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures\\\\nEvaluation of D...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNone.\\\\nPART III\\\\nItem...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nReference is mad...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accountant Fees and Services\\\\n...\", \"section_15\": \"\\\"Item 15. Exhibits and Financial Statements Schedu...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2008": {"config_name": "year_2008", "sample_row": "{\"filename\": \"\\\"3906_2008.htm\\\"\", \"cik\": \"\\\"3906\\\"\", \"year\": \"\\\"2008\\\"\", \"section_1\": \"\\\"Item 1. Business.\\\\nGeneral\\\\nWe are a business dev...\", \"section_1A\": \"\\\"Item 1A.\\\\nRisk Factors.\\\\nInvesting in Allied Capi...\", \"section_1B\": \"\\\"Item 1B.\\\\nUnresolved Staff Comments\\\\nNot applicab...\", \"section_2\": \"\\\"Item 2.\\\\nProperties.\\\\nOur principal offices are l...\", \"section_3\": \"\\\"Item 3.\\\\nLegal Proceedings.\\\\nOn June 23, 2004, we...\", \"section_4\": \"\\\"Item 4.\\\\nSubmission of Matters to a Vote of Secur...\", \"section_5\": \"\\\"Item 5.\\\\nMarket For Registrant\\\\u2019s Common Equi...\", \"section_6\": \"\\\"Item 6. Selected Financial Data.\\\\nSELECTED CONDEN...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosure ...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9.\\\\nChanges in and Disagreements with Accoun...\", \"section_9A\": \"\\\"Item 9A.\\\\nControls and Procedures.\\\\n(a) Evaluatio...\", \"section_9B\": \"\\\"Item 9B.\\\\nOther Information.\\\\nOn February 26, 200...\", \"section_10\": \"\\\"Item 10.\\\\nDirectors, Executive Officers and Corpo...\", \"section_11\": \"\\\"Item 11.\\\\nExecutive Compensation.\\\\nInformation in...\", \"section_12\": \"\\\"Item 12.\\\\nSecurity Ownership of Certain Beneficia...\", \"section_13\": \"\\\"Item 13.\\\\nCertain Relationships and Related Trans...\", \"section_14\": \"\\\"Item 14.\\\\nPrincipal Accountant Fees and Services....\", \"section_15\": \"\\\"Item 15.\\\\nExhibits and Financial Statement Schedu...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2009": {"config_name": "year_2009", "sample_row": "{\"filename\": \"\\\"907654_2009.htm\\\"\", \"cik\": \"\\\"907654\\\"\", \"year\": \"\\\"2009\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nSome of the statements under \\\\u...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nRisks Related to Our Busin...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNot applicabl...\", \"section_2\": \"\\\"Item 2. Properties\\\\nThe Company\\\\u2019s headquarte...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nOn February 9, 2007, N...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nNot applicable.\\\\...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"Item 9A(T). Controls and Procedures\\\\nEvaluation o...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNone.\\\\nPART III\\\\nItem...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nThe response to ...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accountant Fees and Services\\\\n...\", \"section_15\": \"\\\"Item 15. Exhibits and Financial Statement Schedul...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2010": {"config_name": "year_2010", "sample_row": "{\"filename\": \"\\\"1164888_2010.htm\\\"\", \"cik\": \"\\\"1164888\\\"\", \"year\": \"\\\"2010\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\n(A) BUSINESS DEVELOPMENT\\\\nKyto ...\", \"section_1A\": \"\\\"\\\"\", \"section_1B\": \"\\\"\\\"\", \"section_2\": \"\\\"ITEM 2. DESCRIPTION OF PROPERTY\\\\nThe Company occu...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nThere is no litigation...\", \"section_4\": \"\\\"ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SECURI...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR COMMON EQUITY AND RELATED STOC...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nEarnings per sha...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"\\\"\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"ITEM 9A. CONTROLS AND PROCEDURES\\\\nDisclosure Cont...\", \"section_9B\": \"\\\"\\\"\", \"section_10\": \"\\\"ITEM 10. DIRECTORS AND EXECUTIVE OFFICERS, PROMOT...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\n(A) SUMMARY COMP...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. PRINCIPAL ACCOUNTANT FEES AND SERVICES\\\\n...\", \"section_15\": \"\\\"ITEM 15. EXHIBITS AND REPORTS ON FORM 8-K\\\\n(A) LI...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2011": {"config_name": "year_2011", "sample_row": "{\"filename\": \"\\\"1297341_2011.htm\\\"\", \"cik\": \"\\\"1297341\\\"\", \"year\": \"\\\"2011\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nForward-looking Statements\\\\nThi...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nWe may not be able to achi...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNone.\\\\nItem 2...\", \"section_2\": \"\\\"Item 2. Properties\\\\nWe conduct our business throu...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nFrom time to time, we ...\", \"section_4\": \"\\\"Item 4. [Removed and reserved]\\\\nNot applicable.\\\\n...\", \"section_5\": \"\\\"Item 5. Market for the Registrant\\\\u2019s Common E...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nThis item is not...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements With Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures\\\\nManagement is r...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNot Applicable.\\\\nPART...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers, and Corpo...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nThe information ...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accountant Fees and Services\\\\n...\", \"section_15\": \"\\\"Item 15. Exhibits and Financial Statement Schedul...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2012": {"config_name": "year_2012", "sample_row": "{\"filename\": \"\\\"1121901_2012.htm\\\"\", \"cik\": \"\\\"1121901\\\"\", \"year\": \"\\\"2012\\\"\", \"section_1\": \"\\\"Item 1.\\\\nBusiness.\\\\nCompany Overview\\\\nInternation...\", \"section_1A\": \"\\\"Item 1A.\\\\nRisk Factors.\\\\nNot applicable.\\\\nItem 1B...\", \"section_1B\": \"\\\"Item 1B.\\\\nUnresolved Staff Comments.\\\\nNone.\\\\nItem...\", \"section_2\": \"\\\"Item 2.\\\\nProperties.\\\\nOur corporate office is loc...\", \"section_3\": \"\\\"Item 3.\\\\nLegal Proceedings.\\\\nWe are aware of the ...\", \"section_4\": \"\\\"Item 4.\\\\n(Removed and Reserved).\\\\nNot applicable....\", \"section_5\": \"\\\"Item 5.\\\\nMarket for Registrant\\\\u2019s Common Equi...\", \"section_6\": \"\\\"Item 6.\\\\nSelected Financial Data.\\\\nNot applicable...\", \"section_7\": \"\\\"Item 7.\\\\nManagement\\\\u2019s Discussion and Analysi...\", \"section_7A\": \"\\\"Item 7A.\\\\nQuantitative and Qualitative Disclosure...\", \"section_8\": \"\\\"Item 8.\\\\nFinancial Statements and Supplementary D...\", \"section_9\": \"\\\"Item 9.\\\\nChanges in and Disagreements with Accoun...\", \"section_9A\": \"\\\"Item 9A.\\\\nControls and Procedures.\\\\nSee Item 9A(T...\", \"section_9B\": \"\\\"Item 9B.\\\\nOther Information.\\\\nNone.\\\\nPART III\\\\nIt...\", \"section_10\": \"\\\"Item 10.\\\\nDirectors, Executive Officers and Corpo...\", \"section_11\": \"\\\"Item 11.\\\\nExecutive Compensation.\\\\nSummary of Cas...\", \"section_12\": \"\\\"Item 12.\\\\nSecurity Ownership of Certain Beneficia...\", \"section_13\": \"\\\"Item 13.\\\\nCertain Relationships and Related Trans...\", \"section_14\": \"\\\"Item 14.\\\\nPrincipal Accounting Fees and Services....\", \"section_15\": \"\\\"Item 15.\\\\nExhibits, Financial Statement Schedules...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2013": {"config_name": "year_2013", "sample_row": "{\"filename\": \"\\\"875657_2013.htm\\\"\", \"cik\": \"\\\"875657\\\"\", \"year\": \"\\\"2013\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\nGeneral\\\\nWe offer products and ...\", \"section_1A\": \"\\\"ITEM 1A. RISK FACTORS\\\\nOur business faces many ri...\", \"section_1B\": \"\\\"ITEM 1B. UNRESOLVED STAFF COMMENTS\\\\nNone.\\\\nITEM 2...\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nAs of December 31, 2013, we o...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nWe are subject to lega...\", \"section_4\": \"\\\"ITEM 4. MINE SAFETY DISCLOSURES\\\\nNot applicable.\\\\...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT\\\\u2019S COMMON EQUIT...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nAs a smaller rep...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT\\\\u2019S DISCUSSION AND ANALYSIS...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"ITEM 9A. CONTROLS AND PROCEDURES\\\\nEvaluation Of D...\", \"section_9B\": \"\\\"ITEM 9B. OTHER INFORMATION\\\\nNone.\\\\nPART III\\\\nThe ...\", \"section_10\": \"\\\"ITEM 10. DIRECTORS, EXECUTIVE OFFICERS AND CORPOR...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nThe sections ent...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. PRINCIPAL ACCOUNTANT FEES AND SERVICES\\\\n...\", \"section_15\": \"\\\"ITEM 15. EXHIBITS, FINANCIAL STATEMENT SCHEDULES\\\\...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2014": {"config_name": "year_2014", "sample_row": "{\"filename\": \"\\\"884887_2014.htm\\\"\", \"cik\": \"\\\"884887\\\"\", \"year\": \"\\\"2014\\\"\", \"section_1\": \"\\\"Item 1. Business.\\\\nGeneral\\\\nWe are the world\\\\u201...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nThe risk factors set forth...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNone.\\\\nItem 2...\", \"section_2\": \"\\\"Item 2. Properties\\\\nInformation about our cruise ...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nA class action complai...\", \"section_4\": \"\\\"Item 4. Mine Safety Disclosures\\\\nNone.\\\\nPART II\\\\n...\", \"section_5\": \"\\\"Item 5. Market for Registrant's Common Equity, Re...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nThe selected con...\", \"section_7\": \"\\\"Item 7. Management's Discussion and Analysis of F...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes In and Disagreements With Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures\\\\nEvaluation of D...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNone.\\\\nPART III\\\\nItem...\", \"section_10\": \"\\\"\\\"\", \"section_11\": \"\\\"\\\"\", \"section_12\": \"\\\"\\\"\", \"section_13\": \"\\\"\\\"\", \"section_14\": \"\\\"\\\"\", \"section_15\": \"\\\"Item 15. Exhibits and Financial Statement Schedul...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2015": {"config_name": "year_2015", "sample_row": "{\"filename\": \"\\\"874841_2015.htm\\\"\", \"cik\": \"\\\"874841\\\"\", \"year\": \"\\\"2015\\\"\", \"section_1\": \"\\\"ITEM 1. BUSINESS\\\\nGeneral\\\\nPacific Sunwear of Cal...\", \"section_1A\": \"\\\"ITEM 1A. RISK FACTORS\\\\nCautionary Note Regarding ...\", \"section_1B\": \"\\\"ITEM 1B. UNRESOLVED STAFF COMMENTS\\\\nNone.\\\\nITEM 2...\", \"section_2\": \"\\\"ITEM 2. PROPERTIES\\\\nWe operate stores in each of ...\", \"section_3\": \"\\\"ITEM 3. LEGAL PROCEEDINGS\\\\nCharles Pfeiffer, indi...\", \"section_4\": \"\\\"ITEM 4. MINE SAFETY DISCLOSURES\\\\nNot applicable.\\\\...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT\\\\u2019S COMMON EQUIT...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nThe following ta...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT\\\\u2019S DISCUSSION AND ANALYSIS...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"ITEM 9A. CONTROLS AND PROCEDURES\\\\nConclusion Rega...\", \"section_9B\": \"\\\"ITEM 9B. OTHER INFORMATION\\\\nNone.\\\\nPART III\\\\nITEM...\", \"section_10\": \"\\\"ITEM 10. DIRECTORS, EXECUTIVE OFFICERS AND CORPOR...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nInformation with...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSA...\", \"section_14\": \"\\\"ITEM 14. PRINCIPAL ACCOUNTING FEES AND SERVICES\\\\n...\", \"section_15\": \"\\\"ITEM 15. EXHIBITS AND FINANCIAL STATEMENT SCHEDUL...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2016": {"config_name": "year_2016", "sample_row": "{\"filename\": \"\\\"1306035_2016.htm\\\"\", \"cik\": \"\\\"1306035\\\"\", \"year\": \"\\\"2016\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nBackground Information\\\\nThe Com...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nBefore you invest in our c...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNone.\\\\nItem 2...\", \"section_2\": \"\\\"Item 2. Properties\\\\nThe Company currently maintai...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nThe Company is not a p...\", \"section_4\": \"\\\"Item 4. Submission of Matters to a Vote of Securi...\", \"section_5\": \"\\\"ITEM 5. MARKET FOR REGISTRANT'S COMMON EQUITY, RE...\", \"section_6\": \"\\\"ITEM 6. SELECTED FINANCIAL DATA\\\\nNot Applicable.\\\\...\", \"section_7\": \"\\\"ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF F...\", \"section_7A\": \"\\\"ITEM 7A. QUANTITATIVE AND QUALITIATIVE DISCLOSURE...\", \"section_8\": \"\\\"ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DA...\", \"section_9\": \"\\\"ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNT...\", \"section_9A\": \"\\\"ITEM 9A. CONTROLS AND PROCEDURES\\\\nEvaluation of D...\", \"section_9B\": \"\\\"ITEM 9B. OTHER INFORMATION\\\\nNone.\\\\nREPORT OF INDE...\", \"section_10\": \"\\\"ITEM 10.01 DEPARTURE OF DIRECTORS OR PRINCIPAL OF...\", \"section_11\": \"\\\"ITEM 11. EXECUTIVE COMPENSATION\\\\nNo annual and lo...\", \"section_12\": \"\\\"ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL...\", \"section_13\": \"\\\"ITEM 13. CERTAIN RELATIONSHIPS AND RELATED PARTY ...\", \"section_14\": \"\\\"ITEM 14. PRINCIPAL ACCOUNTING FEES AND SERVICES\\\\n...\", \"section_15\": \"\\\"ITEM 15. EXHIBITS, FINANCIAL STATEMENT SCHEDULES\\\\...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2017": {"config_name": "year_2017", "sample_row": "{\"filename\": \"\\\"1595248_2017.htm\\\"\", \"cik\": \"\\\"1595248\\\"\", \"year\": \"\\\"2017\\\"\", \"section_1\": \"\\\"Item 1. Business.\\\\nOverview\\\\nGenprex\\\\u2122 is a c...\", \"section_1A\": \"\\\"Item 1A. Risk Factors.\\\\nInvesting in our common s...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments.\\\\nNone.\\\\nItem ...\", \"section_2\": \"\\\"Item 2. Properties.\\\\nOur corporate and executive ...\", \"section_3\": \"\\\"Item 3. Legal Proceedings.\\\\nWe are not subject to...\", \"section_4\": \"\\\"Item 4. Mine Safety Disclosures.\\\\nNone.\\\\nPART II\\\\...\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data.\\\\nThe following s...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements With Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures.\\\\nEvaluation of ...\", \"section_9B\": \"\\\"Item 9B. Other Information.\\\\nNone.\\\\nPART III\\\\nIte...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation.\\\\nOur named execu...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accounting Fees and Services.\\\\...\", \"section_15\": \"\\\"Item 15. Exhibits, Financial Statement Schedules....\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2018": {"config_name": "year_2018", "sample_row": "{\"filename\": \"\\\"1566373_2018.htm\\\"\", \"cik\": \"\\\"1566373\\\"\", \"year\": \"\\\"2018\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nOverview\\\\nWe are a clinical-sta...\", \"section_1A\": \"\\\"Item 1A. Risk Factors.\\\\nInvesting in our common s...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments.\\\\nNot applicab...\", \"section_2\": \"\\\"Item 2. Properties.\\\\nOur current operations are b...\", \"section_3\": \"\\\"Item 3. Legal Proceedings.\\\\nFrom time to time, we...\", \"section_4\": \"\\\"Item 4. Mine Safety Disclosures.\\\\nNot applicable....\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data.\\\\nThe following s...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements With Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures.\\\\nEvaluation of ...\", \"section_9B\": \"\\\"Item 9B. Other Information.\\\\nNone.\\\\nPART III\\\\nIte...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation.\\\\nThe response to...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accounting Fees and Services.\\\\...\", \"section_15\": \"\\\"Item 15. Exhibits, Financial Statement Schedules....\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2019": {"config_name": "year_2019", "sample_row": "{\"filename\": \"\\\"88121_2019.htm\\\"\", \"cik\": \"\\\"88121\\\"\", \"year\": \"\\\"2019\\\"\", \"section_1\": \"\\\"Item 1. Business\\\\nGeneral Development of Business...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nSeaboard has identified im...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNone.\\\\nItem 2...\", \"section_2\": \"\\\"Item 2. Properties\\\\nManagement believes that Seab...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nThe information requir...\", \"section_4\": \"\\\"Item 4. Mine Safety Disclosures\\\\nNot Applicable.\\\\...\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\n(a)Total assets ...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures\\\\nAs of December ...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNone.\\\\nPART III\\\\nItem...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nThe information ...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accounting Fees and Services\\\\n...\", \"section_15\": \"\\\"Item 15. Exhibits, Financial Statement Schedules\\\\...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}, "year_2020": {"config_name": "year_2020", "sample_row": "{\"filename\": \"\\\"718413_2020.htm\\\"\", \"cik\": \"\\\"718413\\\"\", \"year\": \"\\\"2020\\\"\", \"section_1\": \"\\\"Item 1. The Business\\\\nOrganization and Operation\\\\...\", \"section_1A\": \"\\\"Item 1A. Risk Factors\\\\nBefore deciding to invest ...\", \"section_1B\": \"\\\"Item 1B. Unresolved Staff Comments\\\\nNot Applicabl...\", \"section_2\": \"\\\"Item 2. Properties\\\\nAlthough the Company does not...\", \"section_3\": \"\\\"Item 3. Legal Proceedings\\\\nThere are no pending l...\", \"section_4\": \"\\\"Item 4. Mine Safety Disclosures\\\\nNot Applicable\\\\n...\", \"section_5\": \"\\\"Item 5. Market for Registrant\\\\u2019s Common Equit...\", \"section_6\": \"\\\"Item 6. Selected Financial Data\\\\nOmitted, in acco...\", \"section_7\": \"\\\"Item 7. Management\\\\u2019s Discussion and Analysis...\", \"section_7A\": \"\\\"Item 7A. Quantitative and Qualitative Disclosures...\", \"section_8\": \"\\\"Item 8. Financial Statements and Supplementary Da...\", \"section_9\": \"\\\"Item 9. Changes in and Disagreements with Account...\", \"section_9A\": \"\\\"Item 9A. Controls and Procedures\\\\nDisclosure Cont...\", \"section_9B\": \"\\\"Item 9B. Other Information\\\\nNone\\\\nPART III.\\\\nItem...\", \"section_10\": \"\\\"Item 10. Directors, Executive Officers and Corpor...\", \"section_11\": \"\\\"Item 11. Executive Compensation\\\\nThe following is...\", \"section_12\": \"\\\"Item 12. Security Ownership of Certain Beneficial...\", \"section_13\": \"\\\"Item 13. Certain Relationships and Related Transa...\", \"section_14\": \"\\\"Item 14. Principal Accounting Fees and Services\\\\n...\", \"section_15\": \"\\\"Item 15. Exhibits and Financial Statement Schedul...\"}", "columns": ["filename", "cik", "year", "section_1", "section_1A", "section_1B", "section_2", "section_3", "section_4", "section_5", "section_6", "section_7", "section_7A", "section_8", "section_9", "section_9A", "section_9B", "section_10", "section_11", "section_12", "section_13", "section_14", "section_15"], "columns_mapping": {"filename": "filename", "cik": "cik", "year": "year", "section_1": "section_1", "section_1A": "section_1A", "section_1B": "section_1B", "section_2": "section_2", "section_3": "section_3", "section_4": "section_4", "section_5": "section_5", "section_6": "section_6", "section_7": "section_7", "section_7A": "section_7A", "section_8": "section_8", "section_9": "section_9", "section_9A": "section_9A", "section_9B": "section_9B", "section_10": "section_10", "section_11": "section_11", "section_12": "section_12", "section_13": "section_13", "section_14": "section_14", "section_15": "section_15"}, "dataset_description": "\nThe dataset contains annual filings (10K) of all publicly traded firms from 1993-2020. The table data is stripped but all text is retained.\nThis dataset allows easy access to the EDGAR-CORPUS dataset based on the paper EDGAR-CORPUS: Billions of Tokens Make The World Go Round (See References in README.md for details).\n", "dataset_name": "eloukas/edgar-corpus"}}, "tags": ["task_categories:other", "annotations_creators:no-annotation", "multilinguality:monolingual", "source_datasets:extended|other", "language:en", "research papers", "edgar", "sec", "finance", "financial", "filings", "10K", "10-K", "nlp", "research", "econlp", "economics", "business"], "is_gated": false}, "tasksource/babi_nli": {"dataset_name": "tasksource/babi_nli", "description": "bAbi tasks recasted as natural language inference.", "downloads": 139, "configs": {"single-supporting-fact": {"config_name": "single-supporting-fact", "sample_row": "{\"premise\": \"\\\"John travelled to the bathroom. Sandra moved to t...\", \"hypothesis\": \"\\\"Sandra is in the hallway.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "two-supporting-facts": {"config_name": "two-supporting-facts", "sample_row": "{\"premise\": \"\\\"Mary picked up the apple there. John took the mil...\", \"hypothesis\": \"\\\"The apple is in the garden.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "three-supporting-facts": {"config_name": "three-supporting-facts", "sample_row": "{\"premise\": \"\\\"Mary grabbed the apple. Mary discarded the apple....\", \"hypothesis\": \"\\\"The football before the bedroom was in the hallwa...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "two-arg-relations": {"config_name": "two-arg-relations", "sample_row": "{\"premise\": \"\\\"The kitchen is east of the office. The kitchen is...\", \"hypothesis\": \"\\\"The office west of is kitchen.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "three-arg-relations": {"config_name": "three-arg-relations", "sample_row": "{\"premise\": \"\\\"Bill picked up the milk there. Bill dropped the m...\", \"hypothesis\": \"\\\"Bill received the football.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "yes-no-questions": {"config_name": "yes-no-questions", "sample_row": "{\"premise\": \"\\\"Mary moved to the bathroom. Sandra journeyed to t...\", \"hypothesis\": \"\\\"Sandra is in the hallway.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "counting": {"config_name": "counting", "sample_row": "{\"premise\": \"\\\"Daniel moved to the bathroom. John moved to the k...\", \"hypothesis\": \"\\\"There is one objects is Sandra carrying.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "lists-sets": {"config_name": "lists-sets", "sample_row": "{\"premise\": \"\\\"Daniel grabbed the apple there. Daniel travelled ...\", \"hypothesis\": \"\\\"Daniel is carrying apple.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "simple-negation": {"config_name": "simple-negation", "sample_row": "{\"premise\": \"\\\"Mary is no longer in the bedroom. Daniel moved to...\", \"hypothesis\": \"\\\"Mary is in the bedroom.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "indefinite-knowledge": {"config_name": "indefinite-knowledge", "sample_row": "{\"premise\": \"\\\"Fred is either in the school or the park. Mary we...\", \"hypothesis\": \"\\\"Mary is in the office.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "basic-coreference": {"config_name": "basic-coreference", "sample_row": "{\"premise\": \"\\\"Mary travelled to the bedroom. Afterwards she jou...\", \"hypothesis\": \"\\\"Mary is in the bathroom.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "conjunction": {"config_name": "conjunction", "sample_row": "{\"premise\": \"\\\"Mary and John moved to the bedroom. Daniel and Jo...\", \"hypothesis\": \"\\\"John is in the garden.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "compound-coreference": {"config_name": "compound-coreference", "sample_row": "{\"premise\": \"\\\"John and Daniel went to the office. After that th...\", \"hypothesis\": \"\\\"John is in the kitchen.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "time-reasoning": {"config_name": "time-reasoning", "sample_row": "{\"premise\": \"\\\"Mary journeyed to the kitchen this morning. Mary ...\", \"hypothesis\": \"\\\"Mary before the kitchen was in the school.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "basic-deduction": {"config_name": "basic-deduction", "sample_row": "{\"premise\": \"\\\"Sheep are afraid of cats. Mice are afraid of cats...\", \"hypothesis\": \"\\\"Emily is afraid of cat.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "basic-induction": {"config_name": "basic-induction", "sample_row": "{\"premise\": \"\\\"Lily is a rhino. Lily is white. Bernhard is a swa...\", \"hypothesis\": \"\\\"Greg is yellow.\\\"\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "positional-reasoning": {"config_name": "positional-reasoning", "sample_row": "{\"premise\": \"\\\"The triangle is above the pink rectangle. The blu...\", \"hypothesis\": \"\\\"The pink rectangle is to the right of the blue sq...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "size-reasoning": {"config_name": "size-reasoning", "sample_row": "{\"premise\": \"\\\"The box of chocolates fits inside the chest. The ...\", \"hypothesis\": \"\\\"The box fit in the box of chocolates.\\\"\", \"label\": \"0\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "path-finding": {"config_name": "path-finding", "sample_row": "{\"premise\": \"\\\"The garden is west of the office. The bedroom is ...\", \"hypothesis\": \"\\\"You go from the kitchen to the garden by heading ...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}, "agents-motivations": {"config_name": "agents-motivations", "sample_row": "{\"premise\": \"\\\"Sumit is bored. Jason is bored. Yann is thirsty. ...\", \"hypothesis\": \"\\\"Sumit went to the garden because she was bored.\\\"...\", \"label\": \"1\", \"idx\": \"0\"}", "columns": ["premise", "hypothesis", "label", "idx"], "columns_mapping": {"premise": "premise", "hypothesis": "hypothesis", "label": "label", "idx": "idx"}, "dataset_description": "bAbi tasks recasted as natural language inference.\n", "dataset_name": "tasksource/babi_nli"}}, "tags": ["task_categories:text-classification", "task_ids:natural-language-inference", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:en", "logical reasoning", "nli", "natural-language-inference", "reasoning", "logic"], "is_gated": false}, "JanosAudran/financial-reports-sec": {"dataset_name": "JanosAudran/financial-reports-sec", "description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.", "downloads": 227, "configs": {"large_lite": {"config_name": "large_lite", "sample_row": "{\"cik\": \"\\\"0000001750\\\"\", \"sentence\": \"\\\"ITEM 1.BUSINESS General AAR CORP. and its subsidi...\", \"section\": \"0\", \"labels.1d\": \"0\", \"labels.5d\": \"1\", \"labels.30d\": \"0\", \"filingDate\": \"\\\"2020-07-21\\\"\", \"docID\": \"\\\"0000001750_10-K_2020\\\"\", \"sentenceID\": \"\\\"0000001750_10-K_2020_section_1_0\\\"\", \"sentenceCount\": \"1\"}", "columns": ["cik", "sentence", "section", "labels_1d", "labels_5d", "labels_30d", "filingDate", "docID", "sentenceID", "sentenceCount"], "columns_mapping": {"cik": "cik", "sentence": "sentence", "section": "section", "labels.1d": "labels_1d", "labels.5d": "labels_5d", "labels.30d": "labels_30d", "filingDate": "filingDate", "docID": "docID", "sentenceID": "sentenceID", "sentenceCount": "sentenceCount"}, "dataset_description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.\n", "dataset_name": "JanosAudran/financial-reports-sec"}, "large_full": {"config_name": "large_full", "sample_row": "{\"cik\": \"\\\"0000001750\\\"\", \"sentence\": \"\\\"ITEM 1.BUSINESS General AAR CORP. and its subsidi...\", \"section\": \"0\", \"labels.1d\": \"0\", \"labels.5d\": \"1\", \"labels.30d\": \"0\", \"filingDate\": \"\\\"2020-07-21\\\"\", \"name\": \"\\\"AAR CORP\\\"\", \"docID\": \"\\\"0000001750_10-K_2020\\\"\", \"sentenceID\": \"\\\"0000001750_10-K_2020_section_1_0\\\"\", \"sentenceCount\": \"1\", \"tickers\": \"[\\\"AIR\\\"]\", \"exchanges\": \"[\\\"NYSE\\\"]\", \"entityType\": \"\\\"operating\\\"\", \"sic\": \"\\\"3720\\\"\", \"stateOfIncorporation\": \"\\\"DE\\\"\", \"tickerCount\": \"1\", \"acceptanceDateTime\": \"\\\"2020-07-21T17:19:15.000Z\\\"\", \"form\": \"\\\"10-K\\\"\", \"reportDate\": \"\\\"2020-05-31\\\"\", \"returns.1d.closePriceEndDate\": \"19.0100002289\", \"returns.1d.closePriceStartDate\": \"18.1900005341\", \"returns.1d.endDate\": \"\\\"2020-07-22T00:00:00-04:00\\\"\", \"returns.1d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.1d.ret\": \"0.045079696\", \"returns.5d.closePriceEndDate\": \"17.7199993134\", \"returns.5d.closePriceStartDate\": \"18.1900005341\", \"returns.5d.endDate\": \"\\\"2020-07-27T00:00:00-04:00\\\"\", \"returns.5d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.5d.ret\": \"-0.0258384391\", \"returns.30d.closePriceEndDate\": \"19.25\", \"returns.30d.closePriceStartDate\": \"18.1900005341\", \"returns.30d.endDate\": \"\\\"2020-08-20T00:00:00-04:00\\\"\", \"returns.30d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.30d.ret\": \"0.0582737457\"}", "columns": ["cik", "sentence", "section", "labels_1d", "labels_5d", "labels_30d", "filingDate", "dataset_name", "docID", "sentenceID", "sentenceCount", "tickers", "exchanges", "entityType", "sic", "stateOfIncorporation", "tickerCount", "acceptanceDateTime", "form", "reportDate", "returns_1d_closePriceEndDate", "returns_1d_closePriceStartDate", "returns_1d_endDate", "returns_1d_startDate", "returns_1d_ret", "returns_5d_closePriceEndDate", "returns_5d_closePriceStartDate", "returns_5d_endDate", "returns_5d_startDate", "returns_5d_ret", "returns_30d_closePriceEndDate", "returns_30d_closePriceStartDate", "returns_30d_endDate", "returns_30d_startDate", "returns_30d_ret"], "columns_mapping": {"cik": "cik", "sentence": "sentence", "section": "section", "labels.1d": "labels_1d", "labels.5d": "labels_5d", "labels.30d": "labels_30d", "filingDate": "filingDate", "dataset_name": "dataset_name", "docID": "docID", "sentenceID": "sentenceID", "sentenceCount": "sentenceCount", "tickers": "tickers", "exchanges": "exchanges", "entityType": "entityType", "sic": "sic", "stateOfIncorporation": "stateOfIncorporation", "tickerCount": "tickerCount", "acceptanceDateTime": "acceptanceDateTime", "form": "form", "reportDate": "reportDate", "returns.1d.closePriceEndDate": "returns_1d_closePriceEndDate", "returns.1d.closePriceStartDate": "returns_1d_closePriceStartDate", "returns.1d.endDate": "returns_1d_endDate", "returns.1d.startDate": "returns_1d_startDate", "returns.1d.ret": "returns_1d_ret", "returns.5d.closePriceEndDate": "returns_5d_closePriceEndDate", "returns.5d.closePriceStartDate": "returns_5d_closePriceStartDate", "returns.5d.endDate": "returns_5d_endDate", "returns.5d.startDate": "returns_5d_startDate", "returns.5d.ret": "returns_5d_ret", "returns.30d.closePriceEndDate": "returns_30d_closePriceEndDate", "returns.30d.closePriceStartDate": "returns_30d_closePriceStartDate", "returns.30d.endDate": "returns_30d_endDate", "returns.30d.startDate": "returns_30d_startDate", "returns.30d.ret": "returns_30d_ret"}, "dataset_description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.\n", "dataset_name": "JanosAudran/financial-reports-sec"}, "small_lite": {"config_name": "small_lite", "sample_row": "{\"cik\": \"\\\"0000001750\\\"\", \"sentence\": \"\\\"ITEM 1.BUSINESS General AAR CORP. and its subsidi...\", \"section\": \"0\", \"labels.1d\": \"0\", \"labels.5d\": \"1\", \"labels.30d\": \"0\", \"filingDate\": \"\\\"2020-07-21\\\"\", \"docID\": \"\\\"0000001750_10-K_2020\\\"\", \"sentenceID\": \"\\\"0000001750_10-K_2020_section_1_0\\\"\", \"sentenceCount\": \"1\"}", "columns": ["cik", "sentence", "section", "labels_1d", "labels_5d", "labels_30d", "filingDate", "docID", "sentenceID", "sentenceCount"], "columns_mapping": {"cik": "cik", "sentence": "sentence", "section": "section", "labels.1d": "labels_1d", "labels.5d": "labels_5d", "labels.30d": "labels_30d", "filingDate": "filingDate", "docID": "docID", "sentenceID": "sentenceID", "sentenceCount": "sentenceCount"}, "dataset_description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.\n", "dataset_name": "JanosAudran/financial-reports-sec"}, "small_full": {"config_name": "small_full", "sample_row": "{\"cik\": \"\\\"0000001750\\\"\", \"sentence\": \"\\\"ITEM 1.BUSINESS General AAR CORP. and its subsidi...\", \"section\": \"0\", \"labels.1d\": \"0\", \"labels.5d\": \"1\", \"labels.30d\": \"0\", \"filingDate\": \"\\\"2020-07-21\\\"\", \"name\": \"\\\"AAR CORP\\\"\", \"docID\": \"\\\"0000001750_10-K_2020\\\"\", \"sentenceID\": \"\\\"0000001750_10-K_2020_section_1_0\\\"\", \"sentenceCount\": \"1\", \"tickers\": \"[\\\"AIR\\\"]\", \"exchanges\": \"[\\\"NYSE\\\"]\", \"entityType\": \"\\\"operating\\\"\", \"sic\": \"\\\"3720\\\"\", \"stateOfIncorporation\": \"\\\"DE\\\"\", \"tickerCount\": \"1\", \"acceptanceDateTime\": \"\\\"2020-07-21T17:19:15.000Z\\\"\", \"form\": \"\\\"10-K\\\"\", \"reportDate\": \"\\\"2020-05-31\\\"\", \"returns.1d.closePriceEndDate\": \"19.0100002289\", \"returns.1d.closePriceStartDate\": \"18.1900005341\", \"returns.1d.endDate\": \"\\\"2020-07-22T00:00:00-04:00\\\"\", \"returns.1d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.1d.ret\": \"0.045079696\", \"returns.5d.closePriceEndDate\": \"17.7199993134\", \"returns.5d.closePriceStartDate\": \"18.1900005341\", \"returns.5d.endDate\": \"\\\"2020-07-27T00:00:00-04:00\\\"\", \"returns.5d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.5d.ret\": \"-0.0258384391\", \"returns.30d.closePriceEndDate\": \"19.25\", \"returns.30d.closePriceStartDate\": \"18.1900005341\", \"returns.30d.endDate\": \"\\\"2020-08-20T00:00:00-04:00\\\"\", \"returns.30d.startDate\": \"\\\"2020-07-20T00:00:00-04:00\\\"\", \"returns.30d.ret\": \"0.0582737457\"}", "columns": ["cik", "sentence", "section", "labels_1d", "labels_5d", "labels_30d", "filingDate", "dataset_name", "docID", "sentenceID", "sentenceCount", "tickers", "exchanges", "entityType", "sic", "stateOfIncorporation", "tickerCount", "acceptanceDateTime", "form", "reportDate", "returns_1d_closePriceEndDate", "returns_1d_closePriceStartDate", "returns_1d_endDate", "returns_1d_startDate", "returns_1d_ret", "returns_5d_closePriceEndDate", "returns_5d_closePriceStartDate", "returns_5d_endDate", "returns_5d_startDate", "returns_5d_ret", "returns_30d_closePriceEndDate", "returns_30d_closePriceStartDate", "returns_30d_endDate", "returns_30d_startDate", "returns_30d_ret"], "columns_mapping": {"cik": "cik", "sentence": "sentence", "section": "section", "labels.1d": "labels_1d", "labels.5d": "labels_5d", "labels.30d": "labels_30d", "filingDate": "filingDate", "dataset_name": "dataset_name", "docID": "docID", "sentenceID": "sentenceID", "sentenceCount": "sentenceCount", "tickers": "tickers", "exchanges": "exchanges", "entityType": "entityType", "sic": "sic", "stateOfIncorporation": "stateOfIncorporation", "tickerCount": "tickerCount", "acceptanceDateTime": "acceptanceDateTime", "form": "form", "reportDate": "reportDate", "returns.1d.closePriceEndDate": "returns_1d_closePriceEndDate", "returns.1d.closePriceStartDate": "returns_1d_closePriceStartDate", "returns.1d.endDate": "returns_1d_endDate", "returns.1d.startDate": "returns_1d_startDate", "returns.1d.ret": "returns_1d_ret", "returns.5d.closePriceEndDate": "returns_5d_closePriceEndDate", "returns.5d.closePriceStartDate": "returns_5d_closePriceStartDate", "returns.5d.endDate": "returns_5d_endDate", "returns.5d.startDate": "returns_5d_startDate", "returns.5d.ret": "returns_5d_ret", "returns.30d.closePriceEndDate": "returns_30d_closePriceEndDate", "returns.30d.closePriceStartDate": "returns_30d_closePriceStartDate", "returns.30d.endDate": "returns_30d_endDate", "returns.30d.startDate": "returns_30d_startDate", "returns.30d.ret": "returns_30d_ret"}, "dataset_description": "The dataset contains the annual report of US public firms filing with the SEC EDGAR system.\nEach annual report (10K filing) is broken into 20 sections. Each section is split into individual sentences.\nSentiment labels are provided on a per filing basis from the market reaction around the filing data.\nAdditional metadata for each filing is included in the dataset.\n", "dataset_name": "JanosAudran/financial-reports-sec"}}, "tags": ["task_categories:fill-mask", "task_categories:text-classification", "task_ids:masked-language-modeling", "task_ids:multi-class-classification", "task_ids:sentiment-classification", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:extended|other", "language:en", "'finance", "financial", "10-K", "10K", "10k", "10-k", "annual", "reports", "sec", "edgar", "sentiment", "firm", "public", "us'"], "is_gated": false}, "bigbio/drugprot": {"dataset_name": "bigbio/drugprot", "description": "The DrugProt corpus consists of a) expert-labelled chemical and gene mentions, and (b) all binary relationships between them corresponding to a specific set of biologically relevant relation types.", "downloads": 77, "configs": {"drugprot_source": {"config_name": "drugprot_source", "sample_row": "{\"document_id\": \"\\\"17512723\\\"\", \"title\": \"\\\"RDH12, a retinol dehydrogenase causing Leber's co...\", \"abstract\": \"\\\"Three retinol dehydrogenases (RDHs) were tested f...\", \"text\": \"\\\"RDH12, a retinol dehydrogenase causing Leber's co...\", \"entities\": \"[{\\\"id\\\": \\\"17512723_T1\\\", \\\"type\\\": \\\"CHEMICAL\\\", \\\"text\\\":...\", \"relations\": \"[{\\\"id\\\": \\\"17512723_0\\\", \\\"type\\\": \\\"PRODUCT-OF\\\", \\\"arg1_...\"}", "columns": ["document_id", "title", "abstract", "text", "entities", "relations"], "columns_mapping": {"document_id": "document_id", "title": "title", "abstract": "abstract", "text": "text", "entities": "entities", "relations": "relations"}, "dataset_description": "The DrugProt corpus consists of a) expert-labelled chemical and gene mentions, and (b) all binary relationships between them corresponding to a specific set of biologically relevant relation types.\n", "dataset_name": "bigbio/drugprot"}, "drugprot_bigbio_kb": {"config_name": "drugprot_bigbio_kb", "sample_row": "{\"id\": \"\\\"17512723\\\"\", \"document_id\": \"\\\"17512723\\\"\", \"passages\": \"[{\\\"id\\\": \\\"17512723_title\\\", \\\"type\\\": \\\"title\\\", \\\"text\\\":...\", \"entities\": \"[{\\\"id\\\": \\\"17512723_T1\\\", \\\"type\\\": \\\"CHEMICAL\\\", \\\"text\\\":...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"17512723_0\\\", \\\"type\\\": \\\"PRODUCT-OF\\\", \\\"arg1_...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The DrugProt corpus consists of a) expert-labelled chemical and gene mentions, and (b) all binary relationships between them corresponding to a specific set of biologically relevant relation types.\n", "dataset_name": "bigbio/drugprot"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "bigbio/cpi": {"dataset_name": "bigbio/cpi", "description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts containing annotations of proteins, small molecules, and their relationships", "downloads": 14, "configs": {"cpi_source": {"config_name": "cpi_source", "sample_row": "{\"document_id\": \"\\\"DS.d0\\\"\", \"document_orig_id\": \"\\\"17003041\\\"\", \"sentences\": \"[{\\\"sentence_id\\\": \\\"DS.d0.s0\\\", \\\"sentence_orig_id\\\": \\\"...\"}", "columns": ["document_id", "document_orig_id", "sentences"], "columns_mapping": {"document_id": "document_id", "document_orig_id": "document_orig_id", "sentences": "sentences"}, "dataset_description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts containing annotations of proteins, small molecules, and their relationships\n", "dataset_name": "bigbio/cpi"}, "cpi_iv_source": {"config_name": "cpi_iv_source", "sample_row": "{\"document_id\": \"\\\"DS.d0\\\"\", \"document_orig_id\": \"\\\"17003041\\\"\", \"sentences\": \"[{\\\"sentence_id\\\": \\\"DS.d0.s0\\\", \\\"sentence_orig_id\\\": \\\"...\"}", "columns": ["document_id", "document_orig_id", "sentences"], "columns_mapping": {"document_id": "document_id", "document_orig_id": "document_orig_id", "sentences": "sentences"}, "dataset_description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts containing annotations of proteins, small molecules, and their relationships\n", "dataset_name": "bigbio/cpi"}, "cpi_niv_source": {"config_name": "cpi_niv_source", "sample_row": "{\"document_id\": \"\\\"DS.d0\\\"\", \"document_orig_id\": \"\\\"17003041\\\"\", \"sentences\": \"[{\\\"sentence_id\\\": \\\"DS.d0.s0\\\", \\\"sentence_orig_id\\\": \\\"...\"}", "columns": ["document_id", "document_orig_id", "sentences"], "columns_mapping": {"document_id": "document_id", "document_orig_id": "document_orig_id", "sentences": "sentences"}, "dataset_description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts containing annotations of proteins, small molecules, and their relationships\n", "dataset_name": "bigbio/cpi"}, "cpi_bigbio_kb": {"config_name": "cpi_bigbio_kb", "sample_row": "{\"id\": \"\\\"DS.d0\\\"\", \"document_id\": \"\\\"17003041\\\"\", \"passages\": \"[{\\\"id\\\": \\\"DS.d0.s0\\\", \\\"text\\\": [\\\"Bestrophin-1 enables...\", \"entities\": \"[{\\\"id\\\": \\\"DS.d0.s0.e0\\\", \\\"type\\\": \\\"protein\\\", \\\"text\\\": ...\", \"events\": \"[]\", \"coreferences\": \"[]\", \"relations\": \"[{\\\"id\\\": \\\"DS.d0.s0.i0\\\", \\\"type\\\": \\\"compound-protein-i...\"}", "columns": ["id", "document_id", "passages", "entities", "events", "coreferences", "relations"], "columns_mapping": {"id": "id", "document_id": "document_id", "passages": "passages", "entities": "entities", "events": "events", "coreferences": "coreferences", "relations": "relations"}, "dataset_description": "The compound-protein relationship (CPI) dataset consists of 2,613 sentences from abstracts containing annotations of proteins, small molecules, and their relationships\n", "dataset_name": "bigbio/cpi"}}, "tags": ["multilinguality:monolingual", "language:en"], "is_gated": false}, "DFKI-SLT/kbp37": {"dataset_name": "DFKI-SLT/kbp37", "description": "KBP37 is a revision of MIML-RE annotation dataset, provided by Gabor Angeli et al. (2014). They use both the 2010 and \n2013 KBP official document collections, as well as a July 2013 dump of Wikipedia as the text corpus for annotation. \nThere are 33811 sentences been annotated. Zhang and Wang made several refinements:\n1. They add direction to the relation names, e.g. '`per:employee_of`' is split into '`per:employee of(e1,e2)`'\nand '`per:employee of(e2,e1)`'. They also replace '`org:parents`' with '`org:subsidiaries`' and replace\n'`org:member of\u2019 with '`org:member`' (by their reverse directions).\n2. They discard low frequency relations such that both directions of each relation occur more than 100 times in the \ndataset.\n\nKBP37 contains 18 directional relations and an additional '`no_relation`' relation, resulting in 37 relation classes.", "downloads": 74, "configs": {"kbp37": {"config_name": "kbp37", "sample_row": "{\"id\": \"\\\"0\\\"\", \"sentence\": \"\\\" Thom Yorke of Radiohead has...\", \"relation\": \"27\"}", "columns": ["id", "sentence", "relation"], "columns_mapping": {"id": "id", "sentence": "sentence", "relation": "relation"}, "dataset_description": "KBP37 is a revision of MIML-RE annotation dataset, provided by Gabor Angeli et al. (2014). They use both the 2010 and \n2013 KBP official document collections, as well as a July 2013 dump of Wikipedia as the text corpus for annotation. \nThere are 33811 sentences been annotated. Zhang and Wang made several refinements:\n1. They add direction to the relation names, e.g. '`per:employee_of`' is split into '`per:employee of(e1,e2)`'\nand '`per:employee of(e2,e1)`'. They also replace '`org:parents`' with '`org:subsidiaries`' and replace\n'`org:member of\u2019 with '`org:member`' (by their reverse directions).\n2. They discard low frequency relations such that both directions of each relation occur more than 100 times in the \ndataset.\n\nKBP37 contains 18 directional relations and an additional '`no_relation`' relation, resulting in 37 relation classes.\n", "dataset_name": "DFKI-SLT/kbp37"}, "kbp37_formatted": {"config_name": "kbp37_formatted", "sample_row": "{\"id\": \"\\\"0\\\"\", \"token\": \"[\\\"Thom\\\", \\\"Yorke\\\", \\\"of\\\", \\\"Radiohead\\\", \\\"has\\\", \\\"inclu...\", \"e1_start\": \"0\", \"e1_end\": \"2\", \"e2_start\": \"3\", \"e2_end\": \"4\", \"relation\": \"27\"}", "columns": ["id", "token", "e1_start", "e1_end", "e2_start", "e2_end", "relation"], "columns_mapping": {"id": "id", "token": "token", "e1_start": "e1_start", "e1_end": "e1_end", "e2_start": "e2_start", "e2_end": "e2_end", "relation": "relation"}, "dataset_description": "KBP37 is a revision of MIML-RE annotation dataset, provided by Gabor Angeli et al. (2014). They use both the 2010 and \n2013 KBP official document collections, as well as a July 2013 dump of Wikipedia as the text corpus for annotation. \nThere are 33811 sentences been annotated. Zhang and Wang made several refinements:\n1. They add direction to the relation names, e.g. '`per:employee_of`' is split into '`per:employee of(e1,e2)`'\nand '`per:employee of(e2,e1)`'. They also replace '`org:parents`' with '`org:subsidiaries`' and replace\n'`org:member of\u2019 with '`org:member`' (by their reverse directions).\n2. They discard low frequency relations such that both directions of each relation occur more than 100 times in the \ndataset.\n\nKBP37 contains 18 directional relations and an additional '`no_relation`' relation, resulting in 37 relation classes.\n", "dataset_name": "DFKI-SLT/kbp37"}}, "tags": ["task_categories:text-classification", "task_ids:multi-class-classification", "annotations_creators:other", "multilinguality:monolingual", "source_datasets:extended|other", "language:en", "relation extraction"], "is_gated": false}, "metaeval/utilitarianism": {"dataset_name": "metaeval/utilitarianism", "description": "\"\"\"\n_HOMEPAGE = \"\"\n_LICENSE = \"Creative Commons Attribution-NonCommercial 4.0 International Public License\"\n\n# The HuggingFace dataset library don't host the datasets but only point to the original files\n# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)\n_URLs = {\"default\": \"https://www.dropbox.com/s/041prrjylv0tf0h/ethics.zip?dl=1\"}\n\n\nclass Imppres(datasets.GeneratorBasedBuilder):\n\n VERSION = datasets.Version(\"1.1.0\")\n\n def _info(self):\n features = datasets.Features(\n {\n \"better_choice\": datasets.Value(\"string\"),\n \"worst_choice\": datasets.Value(\"string\"),\n \"comparison\": datasets.Value(\"string\"),\n \"label\": datasets.Value(\"int32\"),\n })\n return datasets.DatasetInfo(\n # This is the description that will appear on the datasets page.\n description=_DESCRIPTION,\n # This defines the different columns of the dataset and their types\n features=features, # Here we define them above because they are different between the two configurations\n # If there's a common (input, target) tuple from the features,\n # specify them here. They'll be used if as_supervised=True in\n # builder.as_dataset.\n supervised_keys=None,\n # Homepage of the dataset for documentation\n homepage=_HOMEPAGE,\n # License for the dataset if available\n license=_LICENSE,\n # Citation for the dataset\n citation=_CITATION,\n )\n\n def _split_generators(self, dl_manager):", "downloads": 11, "configs": {"default": {"config_name": "default", "sample_row": "{\"better_choice\": \"\\\"I built a sandcastle with my nephew. We made one ...\", \"worst_choice\": \"\\\"I built a sandcastle with my nephew\\\"\", \"comparison\": \"\\\"\\\\\\\"I built a sandcastle with my nephew. We made on...\", \"label\": \"1\"}", "columns": ["better_choice", "worst_choice", "comparison", "label"], "columns_mapping": {"better_choice": "better_choice", "worst_choice": "worst_choice", "comparison": "comparison", "label": "label"}, "dataset_description": "", "dataset_name": "metaeval/utilitarianism"}}, "tags": [], "is_gated": false}, "shunk031/wrime": {"dataset_name": "shunk031/wrime", "description": "WRIME dataset is a new dataset for emotional intensity estimation with subjective and objective annotations.", "downloads": 1084, "configs": {"ver1": {"config_name": "ver1", "sample_row": "{\"sentence\": \"\\\"\\\\u307c\\\\u3051\\\\u3063\\\\u3068\\\\u3057\\\\u3066\\\\u305f\\\\u3089\\\\...\", \"user_id\": \"\\\"1\\\"\", \"datetime\": \"\\\"2012/07/31 23:48\\\"\", \"writer.joy\": \"0\", \"writer.sadness\": \"1\", \"writer.anticipation\": \"2\", \"writer.surprise\": \"1\", \"writer.anger\": \"1\", \"writer.fear\": \"0\", \"writer.disgust\": \"0\", \"writer.trust\": \"1\", \"reader1.joy\": \"0\", \"reader1.sadness\": \"2\", \"reader1.anticipation\": \"0\", \"reader1.surprise\": \"0\", \"reader1.anger\": \"0\", \"reader1.fear\": \"0\", \"reader1.disgust\": \"0\", \"reader1.trust\": \"0\", \"reader2.joy\": \"0\", \"reader2.sadness\": \"2\", \"reader2.anticipation\": \"0\", \"reader2.surprise\": \"1\", \"reader2.anger\": \"0\", \"reader2.fear\": \"0\", \"reader2.disgust\": \"0\", \"reader2.trust\": \"0\", \"reader3.joy\": \"0\", \"reader3.sadness\": \"2\", \"reader3.anticipation\": \"0\", \"reader3.surprise\": \"0\", \"reader3.anger\": \"0\", \"reader3.fear\": \"1\", \"reader3.disgust\": \"1\", \"reader3.trust\": \"0\", \"avg_readers.joy\": \"0\", \"avg_readers.sadness\": \"2\", \"avg_readers.anticipation\": \"0\", \"avg_readers.surprise\": \"0\", \"avg_readers.anger\": \"0\", \"avg_readers.fear\": \"0\", \"avg_readers.disgust\": \"0\", \"avg_readers.trust\": \"0\"}", "columns": ["sentence", "user_id", "datetime", "writer_joy", "writer_sadness", "writer_anticipation", "writer_surprise", "writer_anger", "writer_fear", "writer_disgust", "writer_trust", "reader1_joy", "reader1_sadness", "reader1_anticipation", "reader1_surprise", "reader1_anger", "reader1_fear", "reader1_disgust", "reader1_trust", "reader2_joy", "reader2_sadness", "reader2_anticipation", "reader2_surprise", "reader2_anger", "reader2_fear", "reader2_disgust", "reader2_trust", "reader3_joy", "reader3_sadness", "reader3_anticipation", "reader3_surprise", "reader3_anger", "reader3_fear", "reader3_disgust", "reader3_trust", "avg_readers_joy", "avg_readers_sadness", "avg_readers_anticipation", "avg_readers_surprise", "avg_readers_anger", "avg_readers_fear", "avg_readers_disgust", "avg_readers_trust"], "columns_mapping": {"sentence": "sentence", "user_id": "user_id", "datetime": "datetime", "writer.joy": "writer_joy", "writer.sadness": "writer_sadness", "writer.anticipation": "writer_anticipation", "writer.surprise": "writer_surprise", "writer.anger": "writer_anger", "writer.fear": "writer_fear", "writer.disgust": "writer_disgust", "writer.trust": "writer_trust", "reader1.joy": "reader1_joy", "reader1.sadness": "reader1_sadness", "reader1.anticipation": "reader1_anticipation", "reader1.surprise": "reader1_surprise", "reader1.anger": "reader1_anger", "reader1.fear": "reader1_fear", "reader1.disgust": "reader1_disgust", "reader1.trust": "reader1_trust", "reader2.joy": "reader2_joy", "reader2.sadness": "reader2_sadness", "reader2.anticipation": "reader2_anticipation", "reader2.surprise": "reader2_surprise", "reader2.anger": "reader2_anger", "reader2.fear": "reader2_fear", "reader2.disgust": "reader2_disgust", "reader2.trust": "reader2_trust", "reader3.joy": "reader3_joy", "reader3.sadness": "reader3_sadness", "reader3.anticipation": "reader3_anticipation", "reader3.surprise": "reader3_surprise", "reader3.anger": "reader3_anger", "reader3.fear": "reader3_fear", "reader3.disgust": "reader3_disgust", "reader3.trust": "reader3_trust", "avg_readers.joy": "avg_readers_joy", "avg_readers.sadness": "avg_readers_sadness", "avg_readers.anticipation": "avg_readers_anticipation", "avg_readers.surprise": "avg_readers_surprise", "avg_readers.anger": "avg_readers_anger", "avg_readers.fear": "avg_readers_fear", "avg_readers.disgust": "avg_readers_disgust", "avg_readers.trust": "avg_readers_trust"}, "dataset_description": "WRIME dataset is a new dataset for emotional intensity estimation with subjective and objective annotations.\n", "dataset_name": "shunk031/wrime"}, "ver2": {"config_name": "ver2", "sample_row": "{\"sentence\": \"\\\"\\\\u307c\\\\u3051\\\\u3063\\\\u3068\\\\u3057\\\\u3066\\\\u305f\\\\u3089\\\\...\", \"user_id\": \"\\\"1\\\"\", \"datetime\": \"\\\"2012/7/31 23:48\\\"\", \"writer.joy\": \"0\", \"writer.sadness\": \"1\", \"writer.anticipation\": \"2\", \"writer.surprise\": \"1\", \"writer.anger\": \"1\", \"writer.fear\": \"0\", \"writer.disgust\": \"0\", \"writer.trust\": \"1\", \"writer.sentiment\": \"0\", \"reader1.joy\": \"0\", \"reader1.sadness\": \"2\", \"reader1.anticipation\": \"0\", \"reader1.surprise\": \"0\", \"reader1.anger\": \"0\", \"reader1.fear\": \"0\", \"reader1.disgust\": \"0\", \"reader1.trust\": \"0\", \"reader1.sentiment\": \"-2\", \"reader2.joy\": \"0\", \"reader2.sadness\": \"2\", \"reader2.anticipation\": \"0\", \"reader2.surprise\": \"0\", \"reader2.anger\": \"0\", \"reader2.fear\": \"1\", \"reader2.disgust\": \"1\", \"reader2.trust\": \"0\", \"reader2.sentiment\": \"-1\", \"reader3.joy\": \"0\", \"reader3.sadness\": \"2\", \"reader3.anticipation\": \"0\", \"reader3.surprise\": \"1\", \"reader3.anger\": \"0\", \"reader3.fear\": \"0\", \"reader3.disgust\": \"0\", \"reader3.trust\": \"0\", \"reader3.sentiment\": \"-1\", \"avg_readers.joy\": \"0\", \"avg_readers.sadness\": \"2\", \"avg_readers.anticipation\": \"0\", \"avg_readers.surprise\": \"0\", \"avg_readers.anger\": \"0\", \"avg_readers.fear\": \"0\", \"avg_readers.disgust\": \"0\", \"avg_readers.trust\": \"0\", \"avg_readers.sentiment\": \"-1\"}", "columns": ["sentence", "user_id", "datetime", "writer_joy", "writer_sadness", "writer_anticipation", "writer_surprise", "writer_anger", "writer_fear", "writer_disgust", "writer_trust", "writer_sentiment", "reader1_joy", "reader1_sadness", "reader1_anticipation", "reader1_surprise", "reader1_anger", "reader1_fear", "reader1_disgust", "reader1_trust", "reader1_sentiment", "reader2_joy", "reader2_sadness", "reader2_anticipation", "reader2_surprise", "reader2_anger", "reader2_fear", "reader2_disgust", "reader2_trust", "reader2_sentiment", "reader3_joy", "reader3_sadness", "reader3_anticipation", "reader3_surprise", "reader3_anger", "reader3_fear", "reader3_disgust", "reader3_trust", "reader3_sentiment", "avg_readers_joy", "avg_readers_sadness", "avg_readers_anticipation", "avg_readers_surprise", "avg_readers_anger", "avg_readers_fear", "avg_readers_disgust", "avg_readers_trust", "avg_readers_sentiment"], "columns_mapping": {"sentence": "sentence", "user_id": "user_id", "datetime": "datetime", "writer.joy": "writer_joy", "writer.sadness": "writer_sadness", "writer.anticipation": "writer_anticipation", "writer.surprise": "writer_surprise", "writer.anger": "writer_anger", "writer.fear": "writer_fear", "writer.disgust": "writer_disgust", "writer.trust": "writer_trust", "writer.sentiment": "writer_sentiment", "reader1.joy": "reader1_joy", "reader1.sadness": "reader1_sadness", "reader1.anticipation": "reader1_anticipation", "reader1.surprise": "reader1_surprise", "reader1.anger": "reader1_anger", "reader1.fear": "reader1_fear", "reader1.disgust": "reader1_disgust", "reader1.trust": "reader1_trust", "reader1.sentiment": "reader1_sentiment", "reader2.joy": "reader2_joy", "reader2.sadness": "reader2_sadness", "reader2.anticipation": "reader2_anticipation", "reader2.surprise": "reader2_surprise", "reader2.anger": "reader2_anger", "reader2.fear": "reader2_fear", "reader2.disgust": "reader2_disgust", "reader2.trust": "reader2_trust", "reader2.sentiment": "reader2_sentiment", "reader3.joy": "reader3_joy", "reader3.sadness": "reader3_sadness", "reader3.anticipation": "reader3_anticipation", "reader3.surprise": "reader3_surprise", "reader3.anger": "reader3_anger", "reader3.fear": "reader3_fear", "reader3.disgust": "reader3_disgust", "reader3.trust": "reader3_trust", "reader3.sentiment": "reader3_sentiment", "avg_readers.joy": "avg_readers_joy", "avg_readers.sadness": "avg_readers_sadness", "avg_readers.anticipation": "avg_readers_anticipation", "avg_readers.surprise": "avg_readers_surprise", "avg_readers.anger": "avg_readers_anger", "avg_readers.fear": "avg_readers_fear", "avg_readers.disgust": "avg_readers_disgust", "avg_readers.trust": "avg_readers_trust", "avg_readers.sentiment": "avg_readers_sentiment"}, "dataset_description": "WRIME dataset is a new dataset for emotional intensity estimation with subjective and objective annotations.\n", "dataset_name": "shunk031/wrime"}}, "tags": ["task_categories:text-classification", "task_ids:sentiment-classification", "annotations_creators:crowdsourced", "multilinguality:monolingual", "language:ja", "sentiment-analysis", "wrime"], "is_gated": false}, "lucasmccabe/logiqa": {"dataset_name": "lucasmccabe/logiqa", "description": "LogiQA is constructed from the logical comprehension problems from publically available questions of the National Civil Servants Examination of China, which are designed to test the civil servant candidates\u2019 critical thinking and problem solving. This dataset includes the English versions only; the Chinese versions are available via the homepage/original source.", "downloads": 560, "configs": {"default": {"config_name": "default", "sample_row": "{\"context\": \"\\\"Some Cantonese don't like chili, so some southern...\", \"query\": \"\\\"Which of the following can guarantee the above ar...\", \"options\": \"[\\\"Some Cantonese love chili.\\\", \\\"Some people who li...\", \"correct_option\": \"2\"}", "columns": ["context", "query", "options", "correct_option"], "columns_mapping": {"context": "context", "query": "query", "options": "options", "correct_option": "correct_option"}, "dataset_description": "LogiQA is constructed from the logical comprehension problems from publically available questions of the National Civil Servants Examination of China, which are designed to test the civil servant candidates\u2019 critical thinking and problem solving. This dataset includes the English versions only; the Chinese versions are available via the homepage/original source.", "dataset_name": "lucasmccabe/logiqa"}}, "tags": ["task_categories:question-answering", "language:en"], "is_gated": false}, "nlp-thedeep/humset": {"dataset_name": "nlp-thedeep/humset", "description": "HumSet is a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. HumSet is curated by humanitarian analysts and covers various disasters around the globe that occurred from 2018 to 2021 in 46 humanitarian response projects. The dataset consists of approximately 17K annotated documents in three languages of English, French, and Spanish, originally taken from publicly-available resources. For each document, analysts have identified informative snippets (entries) in respect to common humanitarian frameworks, and assigned one or many classes to each entry. See the our paper for details.", "downloads": 246, "configs": {"1.0.0": {"config_name": "1.0.0", "sample_row": "{\"entry_id\": \"\\\"244334\\\"\", \"lead_id\": \"\\\"47982\\\"\", \"project_id\": \"\\\"2225\\\"\", \"lang\": \"\\\"fr\\\"\", \"n_tokens\": \"84\", \"project_title\": \"\\\"IMMAP/DFS RDC\\\"\", \"created_at\": \"\\\"2020-10-05 03:44:04.532391+00\\\"\", \"document\": \"\\\"https://www.radiookapi.net/2021/01/27/actualite/s...\", \"excerpt\": \"\\\"Le groupe Ma\\\\u00ef-Ma\\\\u00ef L\\\\u00e9opard actif da...\", \"sectors\": \"[]\", \"pillars_1d\": \"[\\\"Context\\\"]\", \"pillars_2d\": \"[]\", \"subpillars_1d\": \"[\\\"Context->Politics\\\", \\\"Context->Security & Stabili...\", \"subpillars_2d\": \"[]\"}", "columns": ["entry_id", "lead_id", "project_id", "lang", "n_tokens", "project_title", "created_at", "document", "excerpt", "sectors", "pillars_1d", "pillars_2d", "subpillars_1d", "subpillars_2d"], "columns_mapping": {"entry_id": "entry_id", "lead_id": "lead_id", "project_id": "project_id", "lang": "lang", "n_tokens": "n_tokens", "project_title": "project_title", "created_at": "created_at", "document": "document", "excerpt": "excerpt", "sectors": "sectors", "pillars_1d": "pillars_1d", "pillars_2d": "pillars_2d", "subpillars_1d": "subpillars_1d", "subpillars_2d": "subpillars_2d"}, "dataset_description": "HumSet is a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. HumSet is curated by humanitarian analysts and covers various disasters around the globe that occurred from 2018 to 2021 in 46 humanitarian response projects. The dataset consists of approximately 17K annotated documents in three languages of English, French, and Spanish, originally taken from publicly-available resources. For each document, analysts have identified informative snippets (entries) in respect to common humanitarian frameworks, and assigned one or many classes to each entry. See the our paper for details.\n", "dataset_name": "nlp-thedeep/humset"}, "2.0.0": {"config_name": "2.0.0", "sample_row": "{\"entry_id\": \"\\\"150780\\\"\", \"lead_id\": \"\\\"37921\\\"\", \"lang\": \"\\\"fr\\\"\", \"n_tokens\": \"662\", \"project_title\": \"\\\"IMMAP/DFS Burkina Faso\\\"\", \"created_at\": \"\\\"2020-08-31 14:28:01.040379+00:00\\\"\", \"document\": \"\\\"Unknown\\\"\", \"source_title\": \"\\\"Displacement Tracking Matrix, IOM\\\"\", \"author_title\": \"\\\"Unknown\\\"\", \"excerpt\": \"\\\"En plus du fret, les deux a\\\\u00e9roports principa...\", \"geo_location\": \"[\\\"Boucle du Mouhoun\\\", \\\"Cascades\\\", \\\"Centre\\\", \\\"Centr...\", \"sectors\": \"[\\\"Logistics\\\"]\", \"pillars_1d\": \"[\\\"Covid-19\\\"]\", \"pillars_2d\": \"[\\\"Impact\\\"]\", \"subpillars_1d\": \"[\\\"Covid-19->Restriction Measures\\\"]\", \"subpillars_2d\": \"[\\\"Impact->Impact On Systems, Services And Networks...\", \"displaced\": \"[]\", \"non_displaced\": \"[]\", \"affected\": \"[]\", \"severity\": \"[]\", \"age\": \"[]\", \"gender\": \"[]\", \"specific_needs_groups\": \"[]\"}", "columns": ["entry_id", "lead_id", "lang", "n_tokens", "project_title", "created_at", "document", "source_title", "author_title", "excerpt", "geo_location", "sectors", "pillars_1d", "pillars_2d", "subpillars_1d", "subpillars_2d", "displaced", "non_displaced", "affected", "severity", "age", "gender", "specific_needs_groups"], "columns_mapping": {"entry_id": "entry_id", "lead_id": "lead_id", "lang": "lang", "n_tokens": "n_tokens", "project_title": "project_title", "created_at": "created_at", "document": "document", "source_title": "source_title", "author_title": "author_title", "excerpt": "excerpt", "geo_location": "geo_location", "sectors": "sectors", "pillars_1d": "pillars_1d", "pillars_2d": "pillars_2d", "subpillars_1d": "subpillars_1d", "subpillars_2d": "subpillars_2d", "displaced": "displaced", "non_displaced": "non_displaced", "affected": "affected", "severity": "severity", "age": "age", "gender": "gender", "specific_needs_groups": "specific_needs_groups"}, "dataset_description": "HumSet is a novel and rich multilingual dataset of humanitarian response documents annotated by experts in the humanitarian response community. HumSet is curated by humanitarian analysts and covers various disasters around the globe that occurred from 2018 to 2021 in 46 humanitarian response projects. The dataset consists of approximately 17K annotated documents in three languages of English, French, and Spanish, originally taken from publicly-available resources. For each document, analysts have identified informative snippets (entries) in respect to common humanitarian frameworks, and assigned one or many classes to each entry. See the our paper for details.\n", "dataset_name": "nlp-thedeep/humset"}}, "tags": ["task_categories:text-classification", "task_categories:text-retrieval", "task_categories:token-classification", "task_ids:multi-label-classification", "annotations_creators:expert-generated", "multilinguality:multilingual", "source_datasets:original", "language:en", "language:fr", "language:es", "humanitarian", "research", "analytical-framework", "multilabel", "humset", "humbert"], "is_gated": false}, "ruanchaves/hatebr": {"dataset_name": "ruanchaves/hatebr", "description": "HateBR is the first large-scale expert annotated corpus of Brazilian Instagram comments for hate speech and offensive language detection on the web and social media. The HateBR corpus was collected from Brazilian Instagram comments of politicians and manually annotated by specialists. It is composed of 7,000 documents annotated according to three different layers: a binary classification (offensive versus non-offensive comments), offensiveness-level (highly, moderately, and slightly offensive messages), and nine hate speech groups (xenophobia, racism, homophobia, sexism, religious intolerance, partyism, apology for the dictatorship, antisemitism, and fatphobia). Each comment was annotated by three different annotators and achieved high inter-annotator agreement. Furthermore, baseline experiments were implemented reaching 85% of F1-score outperforming the current literature models for the Portuguese language. Accordingly, we hope that the proposed expertly annotated corpus may foster research on hate speech and offensive language detection in the Natural Language Processing area.", "downloads": 24, "configs": {"default": {"config_name": "default", "sample_row": "{\"instagram_comments\": \"\\\"este lixo ...\\\"\", \"offensive_language\": \"true\", \"offensiveness_levels\": \"1\", \"antisemitism\": \"false\", \"apology_for_the_dictatorship\": \"false\", \"fatphobia\": \"false\", \"homophobia\": \"false\", \"partyism\": \"false\", \"racism\": \"false\", \"religious_intolerance\": \"false\", \"sexism\": \"false\", \"xenophobia\": \"false\", \"offensive_&_non-hate_speech\": \"true\", \"non-offensive\": \"false\", \"specialist_1_hate_speech\": \"false\", \"specialist_2_hate_speech\": \"false\", \"specialist_3_hate_speech\": \"false\"}", "columns": ["instagram_comments", "offensive_language", "offensiveness_levels", "antisemitism", "apology_for_the_dictatorship", "fatphobia", "homophobia", "partyism", "racism", "religious_intolerance", "sexism", "xenophobia", "offensive_&_non-hate_speech", "non-offensive", "specialist_1_hate_speech", "specialist_2_hate_speech", "specialist_3_hate_speech"], "columns_mapping": {"instagram_comments": "instagram_comments", "offensive_language": "offensive_language", "offensiveness_levels": "offensiveness_levels", "antisemitism": "antisemitism", "apology_for_the_dictatorship": "apology_for_the_dictatorship", "fatphobia": "fatphobia", "homophobia": "homophobia", "partyism": "partyism", "racism": "racism", "religious_intolerance": "religious_intolerance", "sexism": "sexism", "xenophobia": "xenophobia", "offensive_&_non-hate_speech": "offensive_&_non-hate_speech", "non-offensive": "non-offensive", "specialist_1_hate_speech": "specialist_1_hate_speech", "specialist_2_hate_speech": "specialist_2_hate_speech", "specialist_3_hate_speech": "specialist_3_hate_speech"}, "dataset_description": "\nHateBR is the first large-scale expert annotated corpus of Brazilian Instagram comments for hate speech and offensive language detection on the web and social media. The HateBR corpus was collected from Brazilian Instagram comments of politicians and manually annotated by specialists. It is composed of 7,000 documents annotated according to three different layers: a binary classification (offensive versus non-offensive comments), offensiveness-level (highly, moderately, and slightly offensive messages), and nine hate speech groups (xenophobia, racism, homophobia, sexism, religious intolerance, partyism, apology for the dictatorship, antisemitism, and fatphobia). Each comment was annotated by three different annotators and achieved high inter-annotator agreement. Furthermore, baseline experiments were implemented reaching 85% of F1-score outperforming the current literature models for the Portuguese language. Accordingly, we hope that the proposed expertly annotated corpus may foster research on hate speech and offensive language detection in the Natural Language Processing area.\n", "dataset_name": "ruanchaves/hatebr"}}, "tags": ["task_categories:text-classification", "task_ids:hate-speech-detection", "annotations_creators:expert-generated", "multilinguality:monolingual", "source_datasets:original", "language:pt", "instagram", "doi:10.57967/hf/0274"], "is_gated": false}, "bigcode/commitpack": {"dataset_name": "bigcode/commitpack", "description": "CommitPack is is a 4TB dataset of commits scraped from GitHub repositories that are permissively licensed.", "downloads": 141, "configs": {"json": {"config_name": "json", "sample_row": "{\"commit\": \"\\\"13aeb023f68d105d167f308c16957c507967b490\\\"\", \"old_file\": \"\\\"contributors.json\\\"\", \"new_file\": \"\\\"contributors.json\\\"\", \"old_contents\": \"\\\"[\\\\n {\\\\n \\\\\\\"prNum\\\\\\\": 847,\\\\n \\\\\\\"time\\\\\\\": \\\\\\\"2014-08-1...\", \"new_contents\": \"\\\"[\\\\n {\\\\n \\\\\\\"prNum\\\\\\\": 848,\\\\n \\\\\\\"time\\\\\\\": \\\\\\\"2014-08-1...\", \"subject\": \"\\\"Added @5290charlie\\\"\", \"message\": \"\\\"Added @5290charlie\\\"\", \"lang\": \"\\\"JSON\\\"\", \"license\": \"\\\"bsd-2-clause\\\"\", \"repos\": \"\\\"shafayeatsumit/patchwork,jmb521/patchwork,contact...\"}", "columns": ["commit", "old_file", "new_file", "old_contents", "new_contents", "subject", "message", "lang", "license", "repos"], "columns_mapping": {"commit": "commit", "old_file": "old_file", "new_file": "new_file", "old_contents": "old_contents", "new_contents": "new_contents", "subject": "subject", "message": "message", "lang": "lang", "license": "license", "repos": "repos"}, "dataset_description": "CommitPack is is a 4TB dataset of commits scraped from GitHub repositories that are permissively licensed.\n", "dataset_name": "bigcode/commitpack"}, "xml": {"config_name": "xml", "sample_row": "{\"commit\": \"\\\"285637f64b964f8aab3866ac6f44549620cdbd20\\\"\", \"old_file\": \"\\\"pom.xml\\\"\", \"new_file\": \"\\\"pom.xml\\\"\", \"old_contents\": \"\\\"\\\\n\\\\r\\\\n

\\\\r\\\\n
\\\\n\\\\n\\\\n\\\\n to examine.\\\"\", \"message\": \"\\\"Fix typo: to examining -> to examine.\\\\n\\\"\", \"lang\": \"\\\"Groff\\\"\", \"license\": \"\\\"bsd-3-clause\\\"\", \"repos\": \"\\\"jrobhoward/SCADAbase,jrobhoward/SCADAbase,jrobhow...\"}", "columns": ["commit", "old_file", "new_file", "old_contents", "new_contents", "subject", "message", "lang", "license", "repos"], "columns_mapping": {"commit": "commit", "old_file": "old_file", "new_file": "new_file", "old_contents": "old_contents", "new_contents": "new_contents", "subject": "subject", "message": "message", "lang": "lang", "license": "license", "repos": "repos"}, "dataset_description": "CommitPackFT is is a 2GB filtered version of CommitPack to contain only high-quality commit messages that resemble natural language instructions.\n", "dataset_name": "bigcode/commitpackft"}, "groovy": {"config_name": "groovy", "sample_row": "{\"commit\": \"\\\"d4967cd2b865160ad756ae143af14772a286d255\\\"\", \"old_file\": \"\\\"subprojects/integ-test/src/integTest/groovy/org/g...\", \"new_file\": \"\\\"subprojects/integ-test/src/integTest/groovy/org/g...\", \"old_contents\": \"\\\"/*\\\\n * Copyright 2013 the original author or auth...\", \"new_contents\": \"\\\"/*\\\\n * Copyright 2013 the original author or auth...\", \"subject\": \"\\\"Fix usage of now removed 'cpp-lib' and 'cpp-exe' ...\", \"message\": \"\\\"Fix usage of now removed 'cpp-lib' and 'cpp-exe' ...\", \"lang\": \"\\\"Groovy\\\"\", \"license\": \"\\\"apache-2.0\\\"\", \"repos\": \"\\\"gradle/gradle,blindpirate/gradle,blindpirate/grad...\"}", "columns": ["commit", "old_file", "new_file", "old_contents", "new_contents", "subject", "message", "lang", "license", "repos"], "columns_mapping": {"commit": "commit", "old_file": "old_file", "new_file": "new_file", "old_contents": "old_contents", "new_contents": "new_contents", "subject": "subject", "message": "message", "lang": "lang", "license": "license", "repos": "repos"}, "dataset_description": "CommitPackFT is is a 2GB filtered version of CommitPack to contain only high-quality commit messages that resemble natural language instructions.\n", "dataset_name": "bigcode/commitpackft"}, "groovy-server-pages": {"config_name": "groovy-server-pages", "sample_row": "{\"commit\": \"\\\"d8917646df8c673944db19176a58c0e38a8c076e\\\"\", \"old_file\": \"\\\"grails-app/views/home/templates/_projects.gsp\\\"\", \"new_file\": \"\\\"grails-app/views/home/templates/_projects.gsp\\\"\", \"old_contents\": \"\\\"