Add BarNER Dataset #3604

Merged · 4 commits · Feb 3, 2025
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
@@ -185,6 +185,7 @@
    NER_ARABIC_ANER,
    NER_ARABIC_AQMAR,
    NER_BASQUE,
    NER_BAVARIAN_WIKI,
    NER_CHINESE_WEIBO,
    NER_DANISH_DANE,
    NER_ENGLISH_MOVIE_COMPLEX,
@@ -477,6 +478,7 @@
"NER_ARABIC_ANER",
"NER_ARABIC_AQMAR",
"NER_BASQUE",
"NER_BAVARIAN_WIKI",
"NER_CHINESE_WEIBO",
"NER_DANISH_DANE",
"NER_ENGLISH_MOVIE_COMPLEX",
91 changes: 91 additions & 0 deletions flair/datasets/sequence_labeling.py
@@ -5530,3 +5530,94 @@ def __init__(
            corpora,
            name="masakha-pos-" + "-".join(languages),
        )


class NER_BAVARIAN_WIKI(ColumnCorpus):
    def __init__(
        self,
        fine_grained: bool = False,
        revision: str = "main",
        base_path: Optional[Union[str, Path]] = None,
        in_memory: bool = True,
        **corpusargs,
    ) -> None:
"""Initialize the Bavarian NER Bavarian NER Dataset (BarNER).

The dataset was proposed in the 2024 LREC-COLING paper
"Sebastian, Basti, Wastl?! Recognizing Named Entities in Bavarian Dialectal Data" paper by Peng et al.
:param fine_grained: Defines if the fine-grained or coarse-grained (default) should be used.
:param revision: Defines the revision/commit of BarNER dataset, by default dataset from 'main' branch is used.
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param in_memory: If True, keeps dataset in memory giving speedups in training.
"""
        base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
        dataset_name = self.__class__.__name__.lower()
        data_folder = base_path / dataset_name
        data_path = flair.cache_root / "datasets" / dataset_name

        document_boundary_marker = "-DOCSTART-"

        for split in ["train", "dev", "test"]:
            # Get original version
            original_split_filename = data_path / "original" / f"bar-wiki-{split}.tsv"
            if not original_split_filename.is_file():
                original_split_url = (
                    f"https://raw.githubusercontent.com/mainlp/BarNER/{revision}/data/BarNER-final/bar-wiki-{split}.tsv"
                )
                cached_path(original_split_url, data_path / "original")

            # Add document boundary marker
            modified_split_filename = data_path / f"bar-wiki-{split}.tsv"
            if not modified_split_filename.is_file():
                with open(original_split_filename, encoding="utf-8") as f_p, open(
                    modified_split_filename, "w", encoding="utf-8"
                ) as f_out:
                    for line in f_p:
                        line = line.strip()
                        if line.startswith("# newdoc id = "):
                            f_out.write(f"{document_boundary_marker}\tO\n\n")
                            continue
                        if line.startswith("# "):
                            continue
                        f_out.write(f"{line}\n")
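            # For illustration (hypothetical sample, not from the corpus): a line
            # "# newdoc id = ..." becomes "-DOCSTART-\tO" plus a blank line, all other
            # "# ..." comment lines are dropped, and token lines such as "Minga\tB-LOC"
            # pass through unchanged.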

        columns = {0: "text", 1: "ner"}

        label_name_map = None

        if not fine_grained:
            # The only allowed classes in the coarse setting are PER, LOC, ORG and MISC.
            # All other NEs are normalized to O, except EVENT and WOA, which are normalized to MISC (cf. Table 3 of the paper).
            label_name_map = {
                "EVENT": "MISC",
                "EVENTderiv": "O",
                "EVENTpart": "O",
                "LANG": "O",
                "LANGderiv": "O",
                "LANGpart": "O",
                "LOCderiv": "O",
                "LOCpart": "O",
                "MISCderiv": "O",
                "MISCpart": "O",
                "ORGderiv": "O",
                "ORGpart": "O",
                "PERderiv": "O",
                "PERpart": "O",
                "RELIGION": "O",
                "RELIGIONderiv": "O",
                "WOA": "MISC",
                "WOAderiv": "O",
                "WOApart": "O",
            }

        super().__init__(
            data_folder,
            columns,
            in_memory=in_memory,
            comment_symbol="# ",
            document_separator_token="-DOCSTART-",
            label_name_map=label_name_map,
            **corpusargs,
        )
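
For context, a minimal usage sketch of the new loader (not part of the diff; it assumes an installation of flair that includes this change, and the printed output is illustrative only):

from flair.datasets import NER_BAVARIAN_WIKI

# Coarse-grained setting (default): labels are reduced to PER, LOC, ORG and MISC.
corpus = NER_BAVARIAN_WIKI()
print(corpus)

# Fine-grained setting keeps the full BarNER tag set (EVENT, LANG, RELIGION, WOA, ...).
corpus_fine = NER_BAVARIAN_WIKI(fine_grained=True)

# Build the label dictionary a downstream tagger would be trained on.
label_dict = corpus.make_label_dictionary(label_type="ner")
print(label_dict)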
23 changes: 23 additions & 0 deletions tests/test_datasets.py
@@ -954,6 +954,29 @@ def test_german_mobie(tasks_base_path):
    ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"


@pytest.mark.skip()
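# Skipped by default: instantiating the corpus downloads the BarNER data on first use.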
def test_bavarian_wiki(tasks_base_path):
    corpus = flair.datasets.NER_BAVARIAN_WIKI()

    ref_sentences = 3_577
    ref_tokens = 75_690

    actual_sentences = sum(
        1 for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"
    )
    actual_tokens = sum(
        len(sentence) for sentence in corpus.train + corpus.dev + corpus.test if sentence[0].text != "-DOCSTART-"
    )

    assert ref_sentences == actual_sentences, (
        f"Number of parsed sentences ({actual_sentences}) does not match with "
        f"reported number of sentences ({ref_sentences})!"
    )
    assert (
        ref_tokens == actual_tokens
    ), f"Number of parsed tokens ({actual_tokens}) does not match with reported number of tokens ({ref_tokens})!"


def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path):
    corpus = MultiFileJsonlCorpus(
        train_files=[tasks_base_path / "jsonl/train.jsonl"],