From 678ca1e1eaf47ac085fef316015c26763cb93f82 Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 6 Jun 2024 13:26:27 +0300 Subject: [PATCH 1/5] feat(#153): inits --- models/model/__init__.py | 21 +++++++++++++++++++++ models/model/pre/__init__.py | 21 +++++++++++++++++++++ models/model/pre/pre_description.py | 4 +--- models/model/pre/pre_name.py | 4 +--- models/model/pre/pre_readme.py | 6 ++---- models/model/pre/pre_topics.py | 4 +--- 6 files changed, 47 insertions(+), 13 deletions(-) create mode 100644 models/model/__init__.py create mode 100644 models/model/pre/__init__.py diff --git a/models/model/__init__.py b/models/model/__init__.py new file mode 100644 index 0000000..1351e23 --- /dev/null +++ b/models/model/__init__.py @@ -0,0 +1,21 @@ +# The MIT License (MIT) +# +# Copyright (c) 2024 Aliaksei Bialiauski +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. diff --git a/models/model/pre/__init__.py b/models/model/pre/__init__.py new file mode 100644 index 0000000..1351e23 --- /dev/null +++ b/models/model/pre/__init__.py @@ -0,0 +1,21 @@ +# The MIT License (MIT) +# +# Copyright (c) 2024 Aliaksei Bialiauski +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
diff --git a/models/model/pre/pre_description.py b/models/model/pre/pre_description.py index 3a335a8..2bb07a9 100644 --- a/models/model/pre/pre_description.py +++ b/models/model/pre/pre_description.py @@ -40,6 +40,4 @@ def tokens(self): stops = set(stopwords.words('english')) filtered = [word for word in tokens if word not in stops] lemmatizer = WordNetLemmatizer() - ready = [lemmatizer.lemmatize(word, pos='v') for word in filtered] - print(f"Preprocessed {self.text} to: {ready}") - return ready + return [lemmatizer.lemmatize(word, pos='v') for word in filtered] diff --git a/models/model/pre/pre_name.py b/models/model/pre/pre_name.py index c2b5e58..4854b07 100644 --- a/models/model/pre/pre_name.py +++ b/models/model/pre/pre_name.py @@ -39,6 +39,4 @@ def tokens(self): stops = set(stopwords.words('english')) filtered = [word for word in tokens if word not in stops] lemmatizer = WordNetLemmatizer() - ready = [lemmatizer.lemmatize(word) for word in filtered] - print(f"Preprocessed {self.origin} to: {ready}") - return ready + return [lemmatizer.lemmatize(word) for word in filtered] diff --git a/models/model/pre/pre_readme.py b/models/model/pre/pre_readme.py index f2504ab..63de12b 100644 --- a/models/model/pre/pre_readme.py +++ b/models/model/pre/pre_readme.py @@ -34,7 +34,7 @@ def __init__(self, content): self.content = content def tokens(self): - lower = self.content.lower() + lower = str(self.content).lower() no_tags = re.sub(r'<.*?>', '', lower) no_puncts = re.sub(r'[^\w\s]', '', no_tags) tokens = word_tokenize(no_puncts) @@ -42,6 +42,4 @@ def tokens(self): stops.update(['b', 'bash']) filtered = [word for word in tokens if word not in stops] lemmatizer = WordNetLemmatizer() - ready = [lemmatizer.lemmatize(word, pos='v') for word in filtered] - print(f"Preprocessed {self.content} to: {ready}") - return ready + return [lemmatizer.lemmatize(word, pos='v') for word in filtered] diff --git a/models/model/pre/pre_topics.py b/models/model/pre/pre_topics.py index 2241b5c..982ef01 100644 --- a/models/model/pre/pre_topics.py +++ b/models/model/pre/pre_topics.py @@ -39,6 +39,4 @@ def tokens(self): stops = set(stopwords.words('english')) split = [topic for topic in split if topic not in stops] lemmatizer = WordNetLemmatizer() - ready = [lemmatizer.lemmatize(topic) for topic in split] - print(f"Preprocessed {self.topics} to {ready}") - return ready + return [lemmatizer.lemmatize(topic) for topic in split] From 0ce617fa2306e72edd4a65c1e9ae73984e480c3f Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 6 Jun 2024 16:37:09 +0300 Subject: [PATCH 2/5] feat(#153): input_ids from raw text --- models/model/pre/embeddings.py | 48 +++++++---------------- models/model_tests/pre/test_embeddings.py | 25 ++++++++---- 2 files changed, 32 insertions(+), 41 deletions(-) diff --git a/models/model/pre/embeddings.py b/models/model/pre/embeddings.py index 49a4cfd..89493f6 100644 --- a/models/model/pre/embeddings.py +++ b/models/model/pre/embeddings.py @@ -19,7 +19,7 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from transformers import BertTokenizer, BertModel +from transformers import BertTokenizer, BertModel, AutoTokenizer import torch """ @@ -27,39 +27,21 @@ """ +# text -> numerical representations -> vector +# 768, defined by the BERT architecture class Embeddings: - def __init__(self, tokens, length, encoder="bert-base-uncased"): - self.tokens = tokens + def __init__(self, raw, length, tokenizer): + self.raw = raw self.length = length - self.tokenizer = BertTokenizer.from_pretrained(encoder) - self.model = BertModel.from_pretrained(encoder) + self.tokenizer = tokenizer def embed(self): - print(f"Generating embeddings for {self.tokens}") - print(f"Encoder: {self.tokenizer}, {self.model}, output length: {self.length}") - inputs = [] - masks = [] - # @todo #143:30min We generate embeddings for each token instead of the whole unit. - # For now, we generate embeddings for each token. We probably should - # generate embeddings for joined tokens as one unit. In this case we - # can try to replace preprocessing steps with a huggingface tokenizers. - # Let's validate this assumption. - for tokens in self.tokens: - ids = self.tokenizer.encode_plus( - tokens, - add_special_tokens=True, - return_tensors='pt', - padding='max_length', - truncation=True, - max_length=self.length - ) - inputs.append(ids["input_ids"]) - masks.append(ids["attention_mask"]) - inputs = torch.cat(inputs, dim=0) - masks = torch.cat(masks, dim=0) - with torch.no_grad(): - outputs = self.model(inputs, attention_mask=masks) - states = outputs.last_hidden_state - embeddings = states[0].numpy() - print(f"Generated embeddings {embeddings}") - return embeddings + tokens = self.tokenizer.tokenize( + self.raw, + padding=True, + truncation=True, + return_tensors='pt' + ) + ids = self.tokenizer.convert_tokens_to_ids(tokens) + final = self.tokenizer.prepare_for_model(ids) + return final diff --git a/models/model_tests/pre/test_embeddings.py b/models/model_tests/pre/test_embeddings.py index f9fc378..b0894bd 100644 --- a/models/model_tests/pre/test_embeddings.py +++ b/models/model_tests/pre/test_embeddings.py @@ -21,6 +21,8 @@ # SOFTWARE. 
import unittest +from transformers import AutoTokenizer + from model.pre.embeddings import Embeddings """ @@ -30,14 +32,21 @@ class TestEmbeddings(unittest.TestCase): - def test_generates_embeddings_for_tokens(self): - shape = Embeddings( - ["apache", "kafka", "examples", "learning"], - 4 - ).embed().shape - expected = (4, 768) + def test_generates_embeddings_for_raw_text(self): + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + inputs = Embeddings( + "apache/kafka-learning-examples", 4, tokenizer + ).embed()["input_ids"] + expected = [101, 15895, 1013, 10556, 24316, 2050, 1011, 4083, 1011, 4973, 102] self.assertEqual( - shape, + inputs, expected, - f"received matrix's shape {shape} does not match with expected {expected}" + f"Generated input IDs {inputs} do not match with expected {expected}" + ) + back = tokenizer.decode(inputs) + typed = "[CLS] apache / kafka - learning - examples [SEP]" + self.assertEqual( + back, + typed, + f"Decoded input IDs {back} do not match with expected {typed}" ) From 77b1e97edef56d8ec0f2459cb76055854df13ad8 Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 6 Jun 2024 17:57:16 +0300 Subject: [PATCH 3/5] feat(#153): 78-sized vector from tokenizer --- models/model/pre/pipeline.py | 33 ++++++++++++------------- models/model_tests/pre/test_pipeline.py | 9 ++++--- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/models/model/pre/pipeline.py b/models/model/pre/pipeline.py index c7c3c10..76844fa 100644 --- a/models/model/pre/pipeline.py +++ b/models/model/pre/pipeline.py @@ -21,10 +21,6 @@ # SOFTWARE. from model.pre.embeddings import Embeddings -from model.pre.pre_description import PreDescription -from model.pre.pre_name import PreName -from model.pre.pre_readme import PreReadme -from model.pre.pre_topics import PreTopics from model.pre.vector import Vector """ @@ -39,25 +35,28 @@ class Pipeline: :param repository Repository to vectorize """ - def __init__(self, repository): + def __init__(self, repository, tokenizer): self.repository = repository + self.tokenizer = tokenizer def apply(self): name = self.repository["name"] print(f"processing {name}") - name = PreName(name).tokens() - readme = PreReadme(self.repository["readme"]).tokens() - description = PreDescription(self.repository["description"]).tokens() - topics = PreTopics(self.repository["topics"]).tokens() - e_name = Embeddings(name, 30).embed() - e_readme = Embeddings(readme, 512).embed() - e_description = Embeddings(description, 100).embed() - e_topics = Embeddings(topics, 100).embed() return Vector( - e_name, - e_readme, - e_description, - e_topics, + Embeddings(name, 30, self.tokenizer).embed()["input_ids"], + Embeddings( + self.repository["readme"], + 512, + self.tokenizer + ).embed()["input_ids"], + Embeddings( + self.repository["description"], + 100, + self.tokenizer + ).embed()["input_ids"], + Embeddings( + self.repository["topics"], 100, self.tokenizer + ).embed()["input_ids"], cpd=self.repository["cpd"], rc=self.repository["rc"], ic=self.repository["ic"] diff --git a/models/model_tests/pre/test_pipeline.py b/models/model_tests/pre/test_pipeline.py index b005f9a..539db20 100644 --- a/models/model_tests/pre/test_pipeline.py +++ b/models/model_tests/pre/test_pipeline.py @@ -22,6 +22,7 @@ import unittest from model.pre.pipeline import Pipeline +from transformers import AutoTokenizer """ Test cases for Pipeline. 
@@ -51,12 +52,13 @@ def test_vectorizes_repository(self): ``` """, "description": "fakehub description", - "topics": ["rust", "github", "mock-api", "testing"], + "topics": "rust,github,mock-api,testing", "cpd": 5.2, "rc": 0.04, "ic": 0.25 } - vector = Pipeline(repository).apply().tolist() + tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + vector = Pipeline(repository, tokenizer).apply().tolist() size = len(vector) cpd = repository["cpd"] has_cpd = cpd in vector @@ -64,7 +66,7 @@ def test_vectorizes_repository(self): has_rc = rc in vector ic = repository["ic"] has_ic = ic in vector - expected = 569859 + expected = 78 self.assertEqual( size, expected, @@ -82,4 +84,3 @@ def test_vectorizes_repository(self): has_ic, f"received vector {vector} does not have IC value: {ic}, but should" ) - From 5c90ff7f1b90ed566a432b81f7709dc063125b82 Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 6 Jun 2024 17:58:16 +0300 Subject: [PATCH 4/5] feat(#153): no nltk pre --- models/model/pre/pre_description.py | 43 --------------- models/model/pre/pre_name.py | 42 -------------- models/model/pre/pre_readme.py | 45 --------------- models/model/pre/pre_topics.py | 42 -------------- .../model_tests/pre/test_pre_description.py | 41 -------------- models/model_tests/pre/test_pre_name.py | 41 -------------- models/model_tests/pre/test_pre_readme.py | 55 ------------------- models/model_tests/pre/test_pre_topics.py | 40 -------------- 8 files changed, 349 deletions(-) delete mode 100644 models/model/pre/pre_description.py delete mode 100644 models/model/pre/pre_name.py delete mode 100644 models/model/pre/pre_readme.py delete mode 100644 models/model/pre/pre_topics.py delete mode 100644 models/model_tests/pre/test_pre_description.py delete mode 100644 models/model_tests/pre/test_pre_name.py delete mode 100644 models/model_tests/pre/test_pre_readme.py delete mode 100644 models/model_tests/pre/test_pre_topics.py diff --git a/models/model/pre/pre_description.py b/models/model/pre/pre_description.py deleted file mode 100644 index 2bb07a9..0000000 --- a/models/model/pre/pre_description.py +++ /dev/null @@ -1,43 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import re - -from nltk import word_tokenize, WordNetLemmatizer -from nltk.corpus import stopwords - -""" -Repository description preprocessing. 
-""" - - -class PreDescription: - def __init__(self, text): - self.text = text - - def tokens(self): - lower = self.text.lower() - no_puncts = re.sub(r'[^\w\s]', '', lower) - tokens = word_tokenize(no_puncts) - stops = set(stopwords.words('english')) - filtered = [word for word in tokens if word not in stops] - lemmatizer = WordNetLemmatizer() - return [lemmatizer.lemmatize(word, pos='v') for word in filtered] diff --git a/models/model/pre/pre_name.py b/models/model/pre/pre_name.py deleted file mode 100644 index 4854b07..0000000 --- a/models/model/pre/pre_name.py +++ /dev/null @@ -1,42 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import re - -from nltk.corpus import stopwords -from nltk.stem import WordNetLemmatizer - -""" -Repository name preprocessing. -""" - - -class PreName: - def __init__(self, origin): - self.origin = origin - - def tokens(self): - name = self.origin.lower() - tokens = re.split(r'[/\-_]', name) - stops = set(stopwords.words('english')) - filtered = [word for word in tokens if word not in stops] - lemmatizer = WordNetLemmatizer() - return [lemmatizer.lemmatize(word) for word in filtered] diff --git a/models/model/pre/pre_readme.py b/models/model/pre/pre_readme.py deleted file mode 100644 index 63de12b..0000000 --- a/models/model/pre/pre_readme.py +++ /dev/null @@ -1,45 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import re -from nltk import WordNetLemmatizer -from nltk.corpus import stopwords -from nltk.tokenize import word_tokenize - -""" -Repository README preprocessing. -""" - - -class PreReadme: - def __init__(self, content): - self.content = content - - def tokens(self): - lower = str(self.content).lower() - no_tags = re.sub(r'<.*?>', '', lower) - no_puncts = re.sub(r'[^\w\s]', '', no_tags) - tokens = word_tokenize(no_puncts) - stops = set(stopwords.words('english')) - stops.update(['b', 'bash']) - filtered = [word for word in tokens if word not in stops] - lemmatizer = WordNetLemmatizer() - return [lemmatizer.lemmatize(word, pos='v') for word in filtered] diff --git a/models/model/pre/pre_topics.py b/models/model/pre/pre_topics.py deleted file mode 100644 index 982ef01..0000000 --- a/models/model/pre/pre_topics.py +++ /dev/null @@ -1,42 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import re - -from nltk.corpus import stopwords -from nltk.stem import WordNetLemmatizer - -""" -Repository topics preprocessing. 
-""" - - -class PreTopics: - def __init__(self, topics): - self.topics = topics - - def tokens(self): - lower = [topic.lower() for topic in self.topics] - split = [re.sub(r'[^a-z0-9\s]', '', topic) for topic in lower] - stops = set(stopwords.words('english')) - split = [topic for topic in split if topic not in stops] - lemmatizer = WordNetLemmatizer() - return [lemmatizer.lemmatize(topic) for topic in split] diff --git a/models/model_tests/pre/test_pre_description.py b/models/model_tests/pre/test_pre_description.py deleted file mode 100644 index 2fbe04c..0000000 --- a/models/model_tests/pre/test_pre_description.py +++ /dev/null @@ -1,41 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import unittest - -from model.pre.pre_description import PreDescription - -""" -Test cases for PreDescription. -""" - - -class TestPreDescription(unittest.TestCase): - - def test_preprocess_description(self): - input = "This repository hosts Java examples" - tokens = PreDescription(input).tokens() - expected = ["repository", "host", "java", "examples"] - self.assertEqual( - tokens, - expected, - f"received tokens {tokens} for input: {input} do not match with expected {expected}" - ) diff --git a/models/model_tests/pre/test_pre_name.py b/models/model_tests/pre/test_pre_name.py deleted file mode 100644 index bcdf341..0000000 --- a/models/model_tests/pre/test_pre_name.py +++ /dev/null @@ -1,41 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import unittest - -from model.pre.pre_name import PreName - -""" -Test cases for PreName. -""" - - -class TestPreName(unittest.TestCase): - - def test_preprocesses_name(self): - input = "streaming-with-flink/examples-java" - tokens = PreName(input).tokens() - expected = ["streaming", "flink", "example", "java"] - self.assertEqual( - tokens, - expected, - f"received tokens {tokens} for input: {input} do not match with expected {expected}" - ) diff --git a/models/model_tests/pre/test_pre_readme.py b/models/model_tests/pre/test_pre_readme.py deleted file mode 100644 index 42df9bc..0000000 --- a/models/model_tests/pre/test_pre_readme.py +++ /dev/null @@ -1,55 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import unittest - -from model.pre.pre_readme import PreReadme - -""" -Test cases for PreReadme. -""" - - -class TestPreReadme(unittest.TestCase): - - def test_preprocesses_readme_in_tokens(self): - tokens = PreReadme(""" - ## Java Examples for Stream Processing with Apache Flink - - This repository hosts Java code examples for - ["Stream Processing with Apache Flink"](//link). - - **Note:** The Java examples are not complete yet.
- The [Scala examples](#scala) placed here. - """).tokens() - expected = [ - "java", "examples", "stream", "process", - "apache", "flink", "repository", "host", - "java", "code", "examples", "stream", - "process", "apache", "flinklink", "note", - "java", "examples", "complete", "yet", - "scala", "examplesscala", "place" - ] - self.assertEqual( - tokens, - expected, - f"received tokens {tokens} do not match with expected {expected}" - ) diff --git a/models/model_tests/pre/test_pre_topics.py b/models/model_tests/pre/test_pre_topics.py deleted file mode 100644 index 71a5409..0000000 --- a/models/model_tests/pre/test_pre_topics.py +++ /dev/null @@ -1,40 +0,0 @@ -# The MIT License (MIT) -# -# Copyright (c) 2024 Aliaksei Bialiauski -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import unittest - -from model.pre.pre_topics import PreTopics - -""" -Test cases for PreTopics. -""" - - -class TestPreTopics(unittest.TestCase): - - def test_preprocesses_topics(self): - tokens = PreTopics(["java", "examples", "flink", "streaming"]).tokens() - expected = ["java", "example", "flink", "streaming"] - self.assertEqual( - tokens, - expected, - f"received tokens {tokens} do not match with expected {expected}" - ) From 269346e1036d032aceacfc8459c2da496da62805 Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 6 Jun 2024 18:23:44 +0300 Subject: [PATCH 5/5] doc(#153): no struct --- models/README.md | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/models/README.md b/models/README.md index ed505bd..acc3e15 100644 --- a/models/README.md +++ b/models/README.md @@ -37,19 +37,9 @@ You will need [Docker] installed. ## How to build new dataset? -Dataset used for model training are located here: -[train.csv](https://github.com/h1alexbel/samples-filter/blob/dataset/train.csv) -To refresh it, run [srdataset] either on cloud VM or locally. The building -process can take a while. After it completed, you should have `dataset.csv` -file with all collected repositories with the following structure: - -* `name`: repository full name, e.g. `redisson/redisson-examples`. -* `readme`: repository README.md file. -* `description`: repository description. -* `topics`: a set of repository topics, e.g. `[apache, streaming, kafka]` -* `CPD`: commits per day calculated metric. -* `RC`: published releases to commits ratio. -* `IC`: issues to commits ratio. +To build a new dataset, run [srdataset] either on cloud VM or locally. The +building process can take a while. 
After it completes, you should have a +`repos.csv` file with all collected repositories. All features must be preprocessed and vectorized using [pipeline.py]. Once you have vectors, you can [feed](#how-to-train-it) them to the models.
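Taken together, the series replaces the nltk preprocessing with a single HuggingFace tokenizer. Below is a minimal sketch of how the reworked pipeline might be driven, modeled on `test_pipeline.py`; the repository record is illustrative, and only the `cpd`, `rc`, and `ic` values are taken from the test fixture:

```python
from transformers import AutoTokenizer

from model.pre.pipeline import Pipeline

# The tests in this series use bert-base-uncased as the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Illustrative repository record; real rows come from the built dataset.
# The keys match what Pipeline.apply() reads.
repository = {
    "name": "apache/kafka-learning-examples",
    "readme": "Learning examples for Apache Kafka.",
    "description": "Apache Kafka learning examples",
    "topics": "kafka,streaming,examples",
    "cpd": 5.2,   # commits per day
    "rc": 0.04,   # releases-to-commits ratio
    "ic": 0.25,   # issues-to-commits ratio
}

# apply() tokenizes name, readme, description, and topics, then packs the
# resulting input IDs together with cpd, rc, and ic into one vector.
vector = Pipeline(repository, tokenizer).apply().tolist()
print(len(vector))  # 78 for the fixture in test_pipeline.py; varies with input
```

Note that without fixed-length padding the vector length depends on the tokenized inputs, which is why the test pins its expectation (78) to one specific fixture.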