dataset.py
import os
import torch.nn.utils.rnn as rnn
from datasets import load_dataset, load_from_disk
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from torch.utils.data import DataLoader


class Opus100Dataset:
    def __init__(self, dataset_path='opus-100-en-ko-torch',
                 tokenizer_path='tokenizer-opus-en-ko.json'):
        # Reuse a previously trained tokenizer if one is cached on disk;
        # otherwise build a fresh, untrained BPE tokenizer.
        if os.path.exists(tokenizer_path):
            self.tokenizer = self.load_tokenizer_from_disk(tokenizer_path)
        else:
            self.tokenizer = self.init_tokenizer()

        # Reuse the preprocessed dataset if it is cached on disk; otherwise
        # download the raw corpus, train the tokenizer on it, then tokenize
        # and cache the dataset.
        if os.path.exists(dataset_path):
            self.dataset = self.load_dataset_from_disk(dataset_path)
        else:
            self.dataset = self.init_dataset()
            self.train_tokenizer(tokenizer_path)
            self.preprocess_dataset(dataset_path)

    def init_dataset(self):
        return load_dataset("Helsinki-NLP/opus-100", "en-ko")

    def init_tokenizer(self):
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        normalizer = normalizers.Sequence([NFD(), StripAccents()])
        tokenizer.normalizer = normalizer
        tokenizer.pre_tokenizer = Whitespace()
        return tokenizer

    def batch_iterator(self, batch_size=1000):
        # Yield batches of raw sentences (both English and Korean) from the
        # training split for tokenizer training.
        tok_dataset = self.dataset['train']
        for batch in tok_dataset.iter(batch_size):
            _batch = []
            for item in batch['translation']:
                _batch.append(item['en'])
                _batch.append(item['ko'])
            yield _batch

    def train_tokenizer(self, tokenizer_path):
        trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
        # `length` only drives the progress bar: each training example
        # contributes two sentences (en and ko).
        self.tokenizer.train_from_iterator(self.batch_iterator(),
                                           trainer=trainer,
                                           length=2 * len(self.dataset['train']))
        self.tokenizer.save(tokenizer_path)

    def preprocess_function(self, examples):
        # Pull out the English and Korean sentences and encode each with the
        # shared tokenizer.
        en_texts = [item['en'] for item in examples['translation']]
        ko_texts = [item['ko'] for item in examples['translation']]
        en_encodings = self.tokenizer.encode_batch(en_texts)
        ko_encodings = self.tokenizer.encode_batch(ko_texts)
        return {
            'en': en_texts,
            'ko': ko_texts,
            'en_ids': [encoding.ids for encoding in en_encodings],
            'ko_ids': [encoding.ids for encoding in ko_encodings]
        }

    def preprocess_dataset(self, dataset_path):
        self.dataset = self.dataset.map(self.preprocess_function,
                                        batched=True,
                                        remove_columns=self.dataset['train'].column_names)
        self.dataset = self.dataset.with_format('torch')
        self.dataset.save_to_disk(dataset_path)

    def load_dataset_from_disk(self, dataset_path):
        return load_from_disk(dataset_path)

    def load_tokenizer_from_disk(self, tokenizer_path):
        return Tokenizer.from_file(tokenizer_path)

    def collate_fn(self, batch):
        # Pad every sequence in the batch to the batch's longest length using
        # the tokenizer's [PAD] id.
        en_batch = [item['en_ids'] for item in batch]
        ko_batch = [item['ko_ids'] for item in batch]
        pad_id = self.tokenizer.token_to_id("[PAD]")
        padded_en_batch = rnn.pad_sequence(en_batch, batch_first=True, padding_value=pad_id)
        padded_ko_batch = rnn.pad_sequence(ko_batch, batch_first=True, padding_value=pad_id)
        return padded_en_batch, padded_ko_batch

    def get_dataloader(self, batch_size=32):
        train_loader = DataLoader(self.dataset['train'], batch_size=batch_size, collate_fn=self.collate_fn)
        valid_loader = DataLoader(self.dataset['validation'], batch_size=batch_size, collate_fn=self.collate_fn)
        test_loader = DataLoader(self.dataset['test'], batch_size=batch_size, collate_fn=self.collate_fn)
        return train_loader, valid_loader, test_loader

    def get_max_len(self):
        # Track the longest tokenized sequence (English or Korean) across all
        # splits.
        max_len = 0

        def track_max_len(batch):
            nonlocal max_len
            for en_ids, ko_ids in zip(batch['en_ids'], batch['ko_ids']):
                max_len = max(max_len, len(en_ids), len(ko_ids))
            return batch

        self.dataset.map(track_max_len, batched=True)
        return max_len

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()
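

# Minimal usage sketch, assuming the default cache paths above and that the
# opus-100 en-ko corpus is either already cached or can be downloaded on the
# first run. It shows how the class is meant to be driven end to end.
if __name__ == '__main__':
    data = Opus100Dataset()
    train_loader, valid_loader, test_loader = data.get_dataloader(batch_size=32)
    print("vocab size:", data.get_vocab_size())

    # Each batch is a pair of padded LongTensors with shape
    # (batch_size, longest_sequence_in_batch).
    en_batch, ko_batch = next(iter(train_loader))
    print(en_batch.shape, ko_batch.shape)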