From 678ca1e1eaf47ac085fef316015c26763cb93f82 Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 6 Jun 2024 13:26:27 +0300 Subject: [PATCH 1/5] feat(#153): inits --- models/model/__init__.py | 21 +++++++++++++++++++++ models/model/pre/__init__.py | 21 +++++++++++++++++++++ models/model/pre/pre_description.py | 4 +--- models/model/pre/pre_name.py | 4 +--- models/model/pre/pre_readme.py | 6 ++---- models/model/pre/pre_topics.py | 4 +--- 6 files changed, 47 insertions(+), 13 deletions(-) create mode 100644 models/model/__init__.py create mode 100644 models/model/pre/__init__.py diff --git a/models/model/__init__.py b/models/model/__init__.py new file mode 100644 index 0000000..1351e23 --- /dev/null +++ b/models/model/__init__.py @@ -0,0 +1,21 @@ +# The MIT License (MIT) +# +# Copyright (c) 2024 Aliaksei Bialiauski +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. diff --git a/models/model/pre/__init__.py b/models/model/pre/__init__.py new file mode 100644 index 0000000..1351e23 --- /dev/null +++ b/models/model/pre/__init__.py @@ -0,0 +1,21 @@ +# The MIT License (MIT) +# +# Copyright (c) 2024 Aliaksei Bialiauski +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
diff --git a/models/model/pre/pre_description.py b/models/model/pre/pre_description.py index 3a335a8..2bb07a9 100644 --- a/models/model/pre/pre_description.py +++ b/models/model/pre/pre_description.py @@ -40,6 +40,4 @@ def tokens(self): stops = set(stopwords.words('english')) filtered = [word for word in tokens if word not in stops] lemmatizer = WordNetLemmatizer() - ready = [lemmatizer.lemmatize(word, pos='v') for word in filtered] - print(f"Preprocessed {self.text} to: {ready}") - return ready + return [lemmatizer.lemmatize(word, pos='v') for word in filtered] diff --git a/models/model/pre/pre_name.py b/models/model/pre/pre_name.py index c2b5e58..4854b07 100644 --- a/models/model/pre/pre_name.py +++ b/models/model/pre/pre_name.py @@ -39,6 +39,4 @@ def tokens(self): stops = set(stopwords.words('english')) filtered = [word for word in tokens if word not in stops] lemmatizer = WordNetLemmatizer() - ready = [lemmatizer.lemmatize(word) for word in filtered] - print(f"Preprocessed {self.origin} to: {ready}") - return ready + return [lemmatizer.lemmatize(word) for word in filtered] diff --git a/models/model/pre/pre_readme.py b/models/model/pre/pre_readme.py index f2504ab..63de12b 100644 --- a/models/model/pre/pre_readme.py +++ b/models/model/pre/pre_readme.py @@ -34,7 +34,7 @@ def __init__(self, content): self.content = content def tokens(self): - lower = self.content.lower() + lower = str(self.content).lower() no_tags = re.sub(r'<.*?>', '', lower) no_puncts = re.sub(r'[^\w\s]', '', no_tags) tokens = word_tokenize(no_puncts) @@ -42,6 +42,4 @@ def tokens(self): stops.update(['b', 'bash']) filtered = [word for word in tokens if word not in stops] lemmatizer = WordNetLemmatizer() - ready = [lemmatizer.lemmatize(word, pos='v') for word in filtered] - print(f"Preprocessed {self.content} to: {ready}") - return ready + return [lemmatizer.lemmatize(word, pos='v') for word in filtered] diff --git a/models/model/pre/pre_topics.py b/models/model/pre/pre_topics.py index 2241b5c..982ef01 100644 --- a/models/model/pre/pre_topics.py +++ b/models/model/pre/pre_topics.py @@ -39,6 +39,4 @@ def tokens(self): stops = set(stopwords.words('english')) split = [topic for topic in split if topic not in stops] lemmatizer = WordNetLemmatizer() - ready = [lemmatizer.lemmatize(topic) for topic in split] - print(f"Preprocessed {self.topics} to {ready}") - return ready + return [lemmatizer.lemmatize(topic) for topic in split] From 0ce617fa2306e72edd4a65c1e9ae73984e480c3f Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 6 Jun 2024 16:37:09 +0300 Subject: [PATCH 2/5] feat(#153): input_ids from raw text --- models/model/pre/embeddings.py | 48 +++++++---------------- models/model_tests/pre/test_embeddings.py | 25 ++++++++---- 2 files changed, 32 insertions(+), 41 deletions(-) diff --git a/models/model/pre/embeddings.py b/models/model/pre/embeddings.py index 49a4cfd..89493f6 100644 --- a/models/model/pre/embeddings.py +++ b/models/model/pre/embeddings.py @@ -19,7 +19,7 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from transformers import BertTokenizer, BertModel +from transformers import BertTokenizer, BertModel, AutoTokenizer import torch """ @@ -27,39 +27,21 @@ """ +# text -> numerical representations -> vector +# 768, defined by the BERT architecture class Embeddings: - def __init__(self, tokens, length, encoder="bert-base-uncased"): - self.tokens = tokens + def __init__(self, raw, length, tokenizer): + self.raw = raw self.length = length - self.tokenizer = BertTokenizer.from_pretrained(encoder) - self.model = BertModel.from_pretrained(encoder) + self.tokenizer = tokenizer def embed(self): - print(f"Generating embeddings for {self.tokens}") - print(f"Encoder: {self.tokenizer}, {self.model}, output length: {self.length}") - inputs = [] - masks = [] - # @todo #143:30min We generate embeddings for each token instead of the whole unit. - # For now, we generate embeddings for each token. We probably should - # generate embeddings for joined tokens as one unit. In this case we - # can try to replace preprocessing steps with a huggingface tokenizers. - # Let's validate this assumption. - for tokens in self.tokens: - ids = self.tokenizer.encode_plus( - tokens, - add_special_tokens=True, - return_tensors='pt', - padding='max_length', - truncation=True, - max_length=self.length - ) - inputs.append(ids["input_ids"]) - masks.append(ids["attention_mask"]) - inputs = torch.cat(inputs, dim=0) - masks = torch.cat(masks, dim=0) - with torch.no_grad(): - outputs = self.model(inputs, attention_mask=masks) - states = outputs.last_hidden_state - embeddings = states[0].numpy() - print(f"Generated embeddings {embeddings}") - return embeddings + tokens = self.tokenizer.tokenize( + self.raw, + padding=True, + truncation=True, + return_tensors='pt' + ) + ids = self.tokenizer.convert_tokens_to_ids(tokens) + final = self.tokenizer.prepare_for_model(ids) + return final diff --git a/models/model_tests/pre/test_embeddings.py b/models/model_tests/pre/test_embeddings.py index f9fc378..b0894bd 100644 --- a/models/model_tests/pre/test_embeddings.py +++ b/models/model_tests/pre/test_embeddings.py @@ -21,6 +21,8 @@ # SOFTWARE. 
import unittest +from transformers import AutoTokenizer + from model.pre.embeddings import Embeddings """ @@ -30,14 +32,21 @@ class TestEmbeddings(unittest.TestCase): - def test_generates_embeddings_for_tokens(self): - shape = Embeddings( - ["apache", "kafka", "examples", "learning"], - 4 - ).embed().shape - expected = (4, 768) + def test_generates_embeddings_for_raw_text(self): + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + inputs = Embeddings( + "apache/kafka-learning-examples", 4, tokenizer + ).embed()["input_ids"] + expected = [101, 15895, 1013, 10556, 24316, 2050, 1011, 4083, 1011, 4973, 102] self.assertEqual( - shape, + inputs, expected, - f"received matrix's shape {shape} does not match with expected {expected}" + f"Generated input IDs {inputs} do not match with expected {expected}" + ) + back = tokenizer.decode(inputs) + typed = "[CLS] apache / kafka - learning - examples [SEP]" + self.assertEqual( + back, + typed, + f"Decoded input IDs {back} do not match with expected {typed}" ) From 77b1e97edef56d8ec0f2459cb76055854df13ad8 Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 6 Jun 2024 17:57:16 +0300 Subject: [PATCH 3/5] feat(#153): 78-sized vector from tokenizer --- models/model/pre/pipeline.py | 33 ++++++++++++------------- models/model_tests/pre/test_pipeline.py | 9 ++++--- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/models/model/pre/pipeline.py b/models/model/pre/pipeline.py index c7c3c10..76844fa 100644 --- a/models/model/pre/pipeline.py +++ b/models/model/pre/pipeline.py @@ -21,10 +21,6 @@ # SOFTWARE. from model.pre.embeddings import Embeddings -from model.pre.pre_description import PreDescription -from model.pre.pre_name import PreName -from model.pre.pre_readme import PreReadme -from model.pre.pre_topics import PreTopics from model.pre.vector import Vector """ @@ -39,25 +35,28 @@ class Pipeline: :param repository Repository to vectorize """ - def __init__(self, repository): + def __init__(self, repository, tokenizer): self.repository = repository + self.tokenizer = tokenizer def apply(self): name = self.repository["name"] print(f"processing {name}") - name = PreName(name).tokens() - readme = PreReadme(self.repository["readme"]).tokens() - description = PreDescription(self.repository["description"]).tokens() - topics = PreTopics(self.repository["topics"]).tokens() - e_name = Embeddings(name, 30).embed() - e_readme = Embeddings(readme, 512).embed() - e_description = Embeddings(description, 100).embed() - e_topics = Embeddings(topics, 100).embed() return Vector( - e_name, - e_readme, - e_description, - e_topics, + Embeddings(name, 30, self.tokenizer).embed()["input_ids"], + Embeddings( + self.repository["readme"], + 512, + self.tokenizer + ).embed()["input_ids"], + Embeddings( + self.repository["description"], + 100, + self.tokenizer + ).embed()["input_ids"], + Embeddings( + self.repository["topics"], 100, self.tokenizer + ).embed()["input_ids"], cpd=self.repository["cpd"], rc=self.repository["rc"], ic=self.repository["ic"] diff --git a/models/model_tests/pre/test_pipeline.py b/models/model_tests/pre/test_pipeline.py index b005f9a..539db20 100644 --- a/models/model_tests/pre/test_pipeline.py +++ b/models/model_tests/pre/test_pipeline.py @@ -22,6 +22,7 @@ import unittest from model.pre.pipeline import Pipeline +from transformers import AutoTokenizer """ Test cases for Pipeline. 
@@ -51,12 +52,13 @@ def test_vectorizes_repository(self): ``` """, "description": "fakehub description", - "topics": ["rust", "github", "mock-api", "testing"], + "topics": "rust,github,mock-api,testing", "cpd": 5.2, "rc": 0.04, "ic": 0.25 } - vector = Pipeline(repository).apply().tolist() + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + vector = Pipeline(repository, tokenizer).apply().tolist() size = len(vector) cpd = repository["cpd"] has_cpd = cpd in vector @@ -64,7 +66,7 @@ def test_vectorizes_repository(self): has_rc = rc in vector ic = repository["ic"] has_ic = ic in vector - expected = 569859 + expected = 78 self.assertEqual( size, expected, @@ -82,4 +84,3 @@ def test_vectorizes_repository(self): has_ic, f"received vector {vector} does not have IC value: {ic}, but should" ) - From 5c90ff7f1b90ed566a432b81f7709dc063125b82 Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 6 Jun 2024 17:58:16 +0300 Subject: [PATCH 4/5] feat(#153): no nltk pre --- models/model/pre/pre_description.py | 43 --------------- models/model/pre/pre_name.py | 42 -------------- models/model/pre/pre_readme.py | 45 --------------- models/model/pre/pre_topics.py | 42 -------------- .../model_tests/pre/test_pre_description.py | 41 -------------- models/model_tests/pre/test_pre_name.py | 41 -------------- models/model_tests/pre/test_pre_readme.py | 55 ------------------- models/model_tests/pre/test_pre_topics.py | 40 -------------- 8 files changed, 349 deletions(-) delete mode 100644 models/model/pre/pre_description.py delete mode 100644 models/model/pre/pre_name.py delete mode 100644 models/model/pre/pre_readme.py delete mode 100644 models/model/pre/pre_topics.py delete mode 100644 models/model_tests/pre/test_pre_description.py delete mode 100644 models/model_tests/pre/test_pre_name.py delete mode 100644 models/model_tests/pre/test_pre_readme.py delete mode 100644 models/model_tests/pre/test_pre_topics.py diff --git a/models/model/pre/pre_description.py b/models/model/pre/pre_description.py deleted file mode 100644 index 2bb07a9..0000000 --- a/models/model/pre/pre_description.py +++ /dev/null @@ -1,43 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import re - -from nltk import word_tokenize, WordNetLemmatizer -from nltk.corpus import stopwords - -""" -Repository description preprocessing. 
-""" - - -class PreDescription: - def __init__(self, text): - self.text = text - - def tokens(self): - lower = self.text.lower() - no_puncts = re.sub(r'[^\w\s]', '', lower) - tokens = word_tokenize(no_puncts) - stops = set(stopwords.words('english')) - filtered = [word for word in tokens if word not in stops] - lemmatizer = WordNetLemmatizer() - return [lemmatizer.lemmatize(word, pos='v') for word in filtered] diff --git a/models/model/pre/pre_name.py b/models/model/pre/pre_name.py deleted file mode 100644 index 4854b07..0000000 --- a/models/model/pre/pre_name.py +++ /dev/null @@ -1,42 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import re - -from nltk.corpus import stopwords -from nltk.stem import WordNetLemmatizer - -""" -Repository name preprocessing. -""" - - -class PreName: - def __init__(self, origin): - self.origin = origin - - def tokens(self): - name = self.origin.lower() - tokens = re.split(r'[/\-_]', name) - stops = set(stopwords.words('english')) - filtered = [word for word in tokens if word not in stops] - lemmatizer = WordNetLemmatizer() - return [lemmatizer.lemmatize(word) for word in filtered] diff --git a/models/model/pre/pre_readme.py b/models/model/pre/pre_readme.py deleted file mode 100644 index 63de12b..0000000 --- a/models/model/pre/pre_readme.py +++ /dev/null @@ -1,45 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import re -from nltk import WordNetLemmatizer -from nltk.corpus import stopwords -from nltk.tokenize import word_tokenize - -""" -Repository README preprocessing. -""" - - -class PreReadme: - def __init__(self, content): - self.content = content - - def tokens(self): - lower = str(self.content).lower() - no_tags = re.sub(r'<.*?>', '', lower) - no_puncts = re.sub(r'[^\w\s]', '', no_tags) - tokens = word_tokenize(no_puncts) - stops = set(stopwords.words('english')) - stops.update(['b', 'bash']) - filtered = [word for word in tokens if word not in stops] - lemmatizer = WordNetLemmatizer() - return [lemmatizer.lemmatize(word, pos='v') for word in filtered] diff --git a/models/model/pre/pre_topics.py b/models/model/pre/pre_topics.py deleted file mode 100644 index 982ef01..0000000 --- a/models/model/pre/pre_topics.py +++ /dev/null @@ -1,42 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import re - -from nltk.corpus import stopwords -from nltk.stem import WordNetLemmatizer - -""" -Repository topics preprocessing. 
-""" - - -class PreTopics: - def __init__(self, topics): - self.topics = topics - - def tokens(self): - lower = [topic.lower() for topic in self.topics] - split = [re.sub(r'[^a-z0-9\s]', '', topic) for topic in lower] - stops = set(stopwords.words('english')) - split = [topic for topic in split if topic not in stops] - lemmatizer = WordNetLemmatizer() - return [lemmatizer.lemmatize(topic) for topic in split] diff --git a/models/model_tests/pre/test_pre_description.py b/models/model_tests/pre/test_pre_description.py deleted file mode 100644 index 2fbe04c..0000000 --- a/models/model_tests/pre/test_pre_description.py +++ /dev/null @@ -1,41 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import unittest - -from model.pre.pre_description import PreDescription - -""" -Test cases for PreDescription. -""" - - -class TestPreDescription(unittest.TestCase): - - def test_preprocess_description(self): - input = "This repository hosts Java examples" - tokens = PreDescription(input).tokens() - expected = ["repository", "host", "java", "examples"] - self.assertEqual( - tokens, - expected, - f"received tokens {tokens} for input: {input} do not match with expected {expected}" - ) diff --git a/models/model_tests/pre/test_pre_name.py b/models/model_tests/pre/test_pre_name.py deleted file mode 100644 index bcdf341..0000000 --- a/models/model_tests/pre/test_pre_name.py +++ /dev/null @@ -1,41 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import unittest - -from model.pre.pre_name import PreName - -""" -Test cases for PreName. -""" - - -class TestPreName(unittest.TestCase): - - def test_preprocesses_name(self): - input = "streaming-with-flink/examples-java" - tokens = PreName(input).tokens() - expected = ["streaming", "flink", "example", "java"] - self.assertEqual( - tokens, - expected, - f"received tokens {tokens} for input: {input} do not match with expected {expected}" - ) diff --git a/models/model_tests/pre/test_pre_readme.py b/models/model_tests/pre/test_pre_readme.py deleted file mode 100644 index 42df9bc..0000000 --- a/models/model_tests/pre/test_pre_readme.py +++ /dev/null @@ -1,55 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import unittest - -from model.pre.pre_readme import PreReadme - -""" -Test cases for PreReadme. -""" - - -class TestPreReadme(unittest.TestCase): - - def test_preprocesses_readme_in_tokens(self): - tokens = PreReadme(""" - ## Java Examples for Stream Processing with Apache Flink - - This repository hosts Java code examples for - ["Stream Processing with Apache Flink"](//link). - - **Note:** The Java examples are not complete yet.
- The [Scala examples](#scala) placed here. - """).tokens() - expected = [ - "java", "examples", "stream", "process", - "apache", "flink", "repository", "host", - "java", "code", "examples", "stream", - "process", "apache", "flinklink", "note", - "java", "examples", "complete", "yet", - "scala", "examplesscala", "place" - ] - self.assertEqual( - tokens, - expected, - f"received tokens {tokens} do not match with expected {expected}" - ) diff --git a/models/model_tests/pre/test_pre_topics.py b/models/model_tests/pre/test_pre_topics.py deleted file mode 100644 index 71a5409..0000000 --- a/models/model_tests/pre/test_pre_topics.py +++ /dev/null @@ -1,40 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import unittest - -from model.pre.pre_topics import PreTopics - -""" -Test cases for PreTopics. -""" - - -class TestPreTopics(unittest.TestCase): - - def test_preprocesses_topics(self): - tokens = PreTopics(["java", "examples", "flink", "streaming"]).tokens() - expected = ["java", "example", "flink", "streaming"] - self.assertEqual( - tokens, - expected, - f"received tokens {tokens} do not match with expected {expected}" - ) From 269346e1036d032aceacfc8459c2da496da62805 Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 6 Jun 2024 18:23:44 +0300 Subject: [PATCH 5/5] doc(#153): no struct --- models/README.md | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/models/README.md b/models/README.md index ed505bd..acc3e15 100644 --- a/models/README.md +++ b/models/README.md @@ -37,19 +37,9 @@ You will need [Docker] installed. ## How to build new dataset? -Dataset used for model training are located here: -[train.csv](https://github.com/h1alexbel/samples-filter/blob/dataset/train.csv) -To refresh it, run [srdataset] either on cloud VM or locally. The building -process can take a while. After it completed, you should have `dataset.csv` -file with all collected repositories with the following structure: - -* `name`: repository full name, e.g. `redisson/redisson-examples`. -* `readme`: repository README.md file. -* `description`: repository description. -* `topics`: a set of repository topics, e.g. `[apache, streaming, kafka]` -* `CPD`: commits per day calculated metric. -* `RC`: published releases to commits ratio. -* `IC`: issues to commits ratio. +To build a new dataset, run [srdataset] either on cloud VM or locally. The +building process can take a while. 
After it completes, you should have a +`repos.csv` file with all collected repositories. All features must be preprocessed and vectorized using [pipeline.py]. Once you have vectors, you can [feed](#how-to-train-it) them to the models.
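Taken together, the series replaces the nltk preprocessing with a single HuggingFace tokenizer. Below is a minimal sketch of how the reworked pipeline might be driven, modeled on `test_pipeline.py`; the repository record is illustrative, and only the `cpd`, `rc`, and `ic` values are taken from the test fixture:

```python
from transformers import AutoTokenizer

from model.pre.pipeline import Pipeline

# The tests in this series use bert-base-uncased as the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Illustrative repository record; real rows come from the built dataset.
# The keys match what Pipeline.apply() reads.
repository = {
    "name": "apache/kafka-learning-examples",
    "readme": "Learning examples for Apache Kafka.",
    "description": "Apache Kafka learning examples",
    "topics": "kafka,streaming,examples",
    "cpd": 5.2,   # commits per day
    "rc": 0.04,   # releases-to-commits ratio
    "ic": 0.25,   # issues-to-commits ratio
}

# apply() tokenizes name, readme, description, and topics, then packs the
# resulting input IDs together with cpd, rc, and ic into one vector.
vector = Pipeline(repository, tokenizer).apply().tolist()
print(len(vector))  # 78 for the fixture in test_pipeline.py; varies with input
```

Note that without fixed-length padding the vector length depends on the tokenized inputs, which is why the test pins its expectation (78) to one specific fixture.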