-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathStaticStopwordRemover.py
38 lines (23 loc) · 1.06 KB
/
StaticStopwordRemover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import Utility
from RuleBasedTokenizer import RuleBasedTokenizer
# IMPORTANT NOTE: The stopword list was taken from https://github.com/ahmetax/trstop.
# Attribution is required. Apart from the list itself, no code from that repository was used.
class StaticStopwordRemover:
    """Remove Turkish stopwords from a sentence using a static stopword list.

    The stopword list is loaded once at construction time from
    ``./DATA/turkce-stop-words.txt`` (list sourced from
    https://github.com/ahmetax/trstop — attribution required; no code
    from that repository is used).
    """

    def __init__(self):
        # Tokenizer used to split sentences into word tokens.
        self.Tokenizer = RuleBasedTokenizer()
        # Store the stopwords in a set: the original list made every
        # membership test in remove_stopwords O(n) per token; a set
        # makes it O(1) with identical semantics.
        self.stop_words_list = set(
            Utility.load_words('./DATA/turkce-stop-words.txt'))

    def remove_stopwords(self, sentence):
        """Tokenize ``sentence`` and drop every stopword token.

        Parameters
        ----------
        sentence : str
            The input text to tokenize and filter.

        Returns
        -------
        tuple
            ``(word_tokens, filtered_sentence)`` — the full token list
            produced by the tokenizer, and the same tokens with
            stopwords removed (original order preserved).
        """
        word_tokens = self.Tokenizer.tokenize(sentence)
        filtered_sentence = [w for w in word_tokens
                             if w not in self.stop_words_list]
        return word_tokens, filtered_sentence
#def main():
# stop_words = load_words('turkce-stop-words') # Stop wordleri import et
# example_sent = """Deneme deneme ama bu şu o bu bir cümle ve belki bu bir cümle"""
# word_tokens, filtered_sentence = StopwordRemover.remove_stopwords(example_sent, stop_words)
# print(word_tokens)
# print(filtered_sentence)
#if __name__ == '__main__':
# main()