data.py
"""
This module can create a "lazy" HF dataset for bengali.
Lazy dataset can yield samples before the downloads samples in background while it is iterating.
"""
import random
import logging
from collections import defaultdict
from functools import partial
from multiprocessing import cpu_count
from typing import Sequence, Optional
import torch
from bnlp import NLTKTokenizer
from datasets import load_dataset, interleave_datasets
from transformers import AlbertTokenizerFast, AlbertTokenizer
from prefetch_generator import BackgroundGenerator
logger = logging.getLogger(__name__)
bnlp_separator = NLTKTokenizer()


def create_instances_from_document(tokenizer, document, max_seq_length):
    """Creates `TrainingInstance`s for a single document."""
    # We DON'T just concatenate all of the tokens from a document into a long
    # sequence and choose an arbitrary split point because this would make the
    # next sentence prediction task too easy. Instead, we split the input into
    # segments "A" and "B" based on the actual "sentences" provided by the user
    # input.
    instances = []
    current_chunk = []
    current_length = 0

    # Pad the Bengali full stop ("।") with spaces so the sentence splitter sees it.
    segmented_sents = bnlp_separator.sentence_tokenize(document.replace("।", " । "))

    for i, sent in enumerate(segmented_sents):
        current_chunk.append(sent)
        current_length += len(tokenizer.tokenize(sent))
        if i == len(segmented_sents) - 1 or current_length >= max_seq_length:
            if len(current_chunk) > 1:
                # `a_end` is how many segments from `current_chunk` go into the `A`
                # (first) sentence.
                a_end = random.randint(1, len(current_chunk) - 1)

                tokens_a = []
                for j in range(a_end):
                    tokens_a.append(current_chunk[j])

                tokens_b = []
                for j in range(a_end, len(current_chunk)):
                    tokens_b.append(current_chunk[j])

                if random.random() < 0.5:
                    # Random next
                    is_random_next = True
                    # in this case, we just swap tokens_a and tokens_b
                    tokens_a, tokens_b = tokens_b, tokens_a
                else:
                    # Actual next
                    is_random_next = False

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                instance = tokenizer(
                    " ".join(tokens_a),
                    " ".join(tokens_b),
                    padding="max_length",
                    truncation="longest_first",
                    max_length=max_seq_length,
                    # We use this option because DataCollatorForLanguageModeling
                    # is more efficient when it receives the `special_tokens_mask`.
                    return_special_tokens_mask=True,
                )
                assert len(instance["input_ids"]) <= max_seq_length
                instance["sentence_order_label"] = 1 if is_random_next else 0
                instances.append(instance)

            current_chunk = []
            current_length = 0

    return instances
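

# Worked example (illustrative, not from the original file): if a chunk holds
# sentences [s1, s2, s3] and a_end == 2, then segment A is "s1 s2" and segment B
# is "s3"; with probability 0.5 the two segments are swapped and the instance
# gets sentence_order_label == 1 (swapped) instead of 0 (original order).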


def tokenize_function(tokenizer, examples):
    # Remove empty texts
    texts = (text for text in examples["text"] if len(text) > 0 and not text.isspace())

    new_examples = defaultdict(list)
    for text in texts:
        instances = create_instances_from_document(tokenizer, text, max_seq_length=512)
        for instance in instances:
            for key, value in instance.items():
                new_examples[key].append(value)

    return new_examples
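

# Usage sketch (for illustration; this mirrors make_lazy_wikioscar_dataset below):
# `tokenize_function` is intended for `datasets.map` in batched mode, e.g.
#
#   dataset = dataset.map(partial(tokenize_function, tokenizer), batched=True, batch_size=256)
#
# One input text can produce several training instances, so the output batch
# may be longer than the input batch.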


class WrappedIterableDataset(torch.utils.data.IterableDataset):
    """Wraps a huggingface IterableDataset as a pytorch IterableDataset with the methods DataLoader expects."""

    def __init__(self, hf_iterable, verbose: bool = True):
        self.hf_iterable = hf_iterable
        self.verbose = verbose

    def __iter__(self):
        started = False
        if self.verbose:
            logger.info("Pre-fetching training samples...")
        while True:  # restart the source stream forever; training is bounded by steps, not epochs
            for sample in BackgroundGenerator(iter(self.hf_iterable), max_prefetch=64):
                if not started:
                    if self.verbose:
                        logger.info("Began iterating minibatches!")
                    started = True
                yield sample
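

# Usage sketch (an assumption, not from the original file): because __iter__
# restarts the source stream forever, a DataLoader built on this dataset never
# exhausts; bound training by a step count rather than by epochs, e.g.
#
#   loader = torch.utils.data.DataLoader(wrapped_dataset, batch_size=8)
#   for step, batch in enumerate(loader):
#       if step >= max_steps:
#           break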


def make_lazy_wikioscar_dataset(
    tokenizer,
    probs: Sequence[float] = (0.23, 0.77),
    shuffle_buffer_size: int = 10 ** 4,
    shuffle_seed: Optional[int] = None,
    preprocessing_batch_size: int = 256,
):
    wiki = load_dataset("lhoestq/wikipedia_bn", split="train", streaming=True)
    oscar = load_dataset("oscar", "unshuffled_deduplicated_bn", split="train", streaming=True)

    # both should have the same columns; tag every example with its origin
    # (batched map receives dicts of lists, so "orig" must be a list as well)
    wiki = wiki.map(lambda x: {"text": x["text"], "orig": [f"wiki[{title}]" for title in x["title"]]}, batched=True)
    oscar = oscar.map(lambda x: {"text": x["text"], "orig": [f"oscar[{i}]" for i in x["id"]]}, batched=True)

    # merge, shuffle and set pytorch format
    dataset = interleave_datasets([wiki, oscar], probabilities=list(probs))
    dataset = dataset.shuffle(shuffle_buffer_size, seed=shuffle_seed)
    # ^-- this creates a buffer of random examples that will be refilled in background
    dataset = dataset.map(partial(tokenize_function, tokenizer), batched=True, batch_size=preprocessing_batch_size)
    dataset = dataset.with_format("torch")
    return WrappedIterableDataset(dataset)
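

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original pipeline).
    # "albert-base-v2" is a placeholder checkpoint; substitute the ALBERT
    # tokenizer actually used for Bengali training. Streaming requires
    # network access, so treat this as a manual sanity check.
    logging.basicConfig(level=logging.INFO)

    tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")  # placeholder checkpoint
    dataset = make_lazy_wikioscar_dataset(tokenizer, shuffle_seed=42)
    loader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=0)

    # Fetch a single minibatch and report the tensor shapes.
    batch = next(iter(loader))
    print({key: tuple(value.shape) for key, value in batch.items() if isinstance(value, torch.Tensor)})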