
Merge branch '__rultor'
rultor committed Jun 6, 2024
2 parents a52f509 + 269346e commit ce612fd
Showing 13 changed files with 56 additions and 390 deletions.
16 changes: 3 additions & 13 deletions models/README.md
@@ -37,19 +37,9 @@ You will need [Docker] installed.

## How to build a new dataset?

The dataset used for model training is located here:
[train.csv](https://github.com/h1alexbel/samples-filter/blob/dataset/train.csv)
To refresh it, run [srdataset] either on a cloud VM or locally. The build
process can take a while. After it completes, you should have a `dataset.csv`
file with all collected repositories, with the following structure:

* `name`: repository full name, e.g. `redisson/redisson-examples`.
* `readme`: repository README.md file.
* `description`: repository description.
* `topics`: a set of repository topics, e.g. `[apache, streaming, kafka]`.
* `CPD`: calculated commits-per-day metric.
* `RC`: ratio of published releases to commits.
* `IC`: ratio of issues to commits.
To build a new dataset, run [srdataset] either on a cloud VM or locally. The
build process can take a while. After it completes, you should have a
`repos.csv` file with all collected repositories.
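
A quick way to sanity-check the resulting file is to load it with pandas. This is only a sketch: the file name and the column names are taken from the dataset description above and may differ in the file you actually build.

```python
# Minimal sanity check of the built dataset (file and column names are
# assumptions based on the description above).
import pandas as pd

df = pd.read_csv("repos.csv")
print(df.shape)  # how many repositories were collected
print(df[["name", "CPD", "RC", "IC"]].head())  # spot-check the calculated metrics
```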

All features must be preprocessed and vectorized using [pipeline.py].
Once you have vectors, you can [feed](#how-to-train-it) them to the models.
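
As a rough sketch of that step, assuming the `Pipeline` interface introduced in this commit (a repository dict plus a Hugging Face tokenizer); the tokenizer choice and the lower-case field names are assumptions:

```python
# Sketch only: vectorize every collected repository with Pipeline.
import pandas as pd
from transformers import AutoTokenizer

from model.pre.pipeline import Pipeline

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
rows = (
    pd.read_csv("repos.csv")
    .rename(columns=str.lower)  # Pipeline reads lower-case keys such as "cpd"
    .to_dict(orient="records")
)
vectors = [Pipeline(row, tokenizer).apply() for row in rows]
```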
@@ -19,22 +19,3 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import unittest

from model.pre.pre_topics import PreTopics

"""
Test cases for PreTopics.
"""


class TestPreTopics(unittest.TestCase):

def test_preprocesses_topics(self):
tokens = PreTopics(["java", "examples", "flink", "streaming"]).tokens()
expected = ["java", "example", "flink", "streaming"]
self.assertEqual(
tokens,
expected,
f"received tokens {tokens} do not match with expected {expected}"
)
@@ -19,23 +19,3 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import unittest

from model.pre.pre_name import PreName

"""
Test cases for PreName.
"""


class TestPreName(unittest.TestCase):

def test_preprocesses_name(self):
input = "streaming-with-flink/examples-java"
tokens = PreName(input).tokens()
expected = ["streaming", "flink", "example", "java"]
self.assertEqual(
tokens,
expected,
f"received tokens {tokens} for input: {input} do not match with expected {expected}"
)
48 changes: 15 additions & 33 deletions models/model/pre/embeddings.py
@@ -19,47 +19,29 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
-from transformers import BertTokenizer, BertModel
+from transformers import BertTokenizer, BertModel, AutoTokenizer
 import torch

 """
 Generate embeddings for a set of tokens.
 """


 # text -> numerical representations -> vector
 # 768, defined by the BERT architecture
 class Embeddings:
-    def __init__(self, tokens, length, encoder="bert-base-uncased"):
-        self.tokens = tokens
+    def __init__(self, raw, length, tokenizer):
+        self.raw = raw
         self.length = length
-        self.tokenizer = BertTokenizer.from_pretrained(encoder)
-        self.model = BertModel.from_pretrained(encoder)
+        self.tokenizer = tokenizer

     def embed(self):
-        print(f"Generating embeddings for {self.tokens}")
-        print(f"Encoder: {self.tokenizer}, {self.model}, output length: {self.length}")
-        inputs = []
-        masks = []
-        # @todo #143:30min We generate embeddings for each token instead of the whole unit.
-        # For now, we generate embeddings for each token. We probably should
-        # generate embeddings for joined tokens as one unit. In this case we
-        # can try to replace preprocessing steps with a huggingface tokenizers.
-        # Let's validate this assumption.
-        for tokens in self.tokens:
-            ids = self.tokenizer.encode_plus(
-                tokens,
-                add_special_tokens=True,
-                return_tensors='pt',
-                padding='max_length',
-                truncation=True,
-                max_length=self.length
-            )
-            inputs.append(ids["input_ids"])
-            masks.append(ids["attention_mask"])
-        inputs = torch.cat(inputs, dim=0)
-        masks = torch.cat(masks, dim=0)
-        with torch.no_grad():
-            outputs = self.model(inputs, attention_mask=masks)
-        states = outputs.last_hidden_state
-        embeddings = states[0].numpy()
-        print(f"Generated embeddings {embeddings}")
-        return embeddings
+        tokens = self.tokenizer.tokenize(
+            self.raw,
+            padding=True,
+            truncation=True,
+            return_tensors='pt'
+        )
+        ids = self.tokenizer.convert_tokens_to_ids(tokens)
+        final = self.tokenizer.prepare_for_model(ids)
+        return final
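
After this change, `Embeddings.embed()` returns the tokenizer's `prepare_for_model()` output (a dict-like object containing `input_ids`) instead of BERT hidden-state vectors, and pipeline.py reads the `input_ids` field from it. A minimal usage sketch; the `bert-base-uncased` tokenizer is an assumption (it was the default encoder in the removed code):

```python
# Usage sketch for the reworked Embeddings class; the tokenizer choice is an assumption.
from transformers import AutoTokenizer

from model.pre.embeddings import Embeddings

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoded = Embeddings("streaming examples for apache flink", 30, tokenizer).embed()
print(encoded["input_ids"])  # token ids produced by prepare_for_model()
```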
33 changes: 16 additions & 17 deletions models/model/pre/pipeline.py
@@ -21,10 +21,6 @@
 # SOFTWARE.

 from model.pre.embeddings import Embeddings
-from model.pre.pre_description import PreDescription
-from model.pre.pre_name import PreName
-from model.pre.pre_readme import PreReadme
-from model.pre.pre_topics import PreTopics
 from model.pre.vector import Vector

 """
@@ -39,25 +35,28 @@ class Pipeline:
     :param repository Repository to vectorize
     """

-    def __init__(self, repository):
+    def __init__(self, repository, tokenizer):
         self.repository = repository
+        self.tokenizer = tokenizer

     def apply(self):
         name = self.repository["name"]
         print(f"processing {name}")
-        name = PreName(name).tokens()
-        readme = PreReadme(self.repository["readme"]).tokens()
-        description = PreDescription(self.repository["description"]).tokens()
-        topics = PreTopics(self.repository["topics"]).tokens()
-        e_name = Embeddings(name, 30).embed()
-        e_readme = Embeddings(readme, 512).embed()
-        e_description = Embeddings(description, 100).embed()
-        e_topics = Embeddings(topics, 100).embed()
         return Vector(
-            e_name,
-            e_readme,
-            e_description,
-            e_topics,
+            Embeddings(name, 30, self.tokenizer).embed()["input_ids"],
+            Embeddings(
+                self.repository["readme"],
+                512,
+                self.tokenizer
+            ).embed()["input_ids"],
+            Embeddings(
+                self.repository["description"],
+                100,
+                self.tokenizer
+            ).embed()["input_ids"],
+            Embeddings(
+                self.repository["topics"], 100, self.tokenizer
+            ).embed()["input_ids"],
             cpd=self.repository["cpd"],
             rc=self.repository["rc"],
             ic=self.repository["ic"]
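
A minimal call sketch for the updated `Pipeline` constructor; the repository dict below is illustrative (its values, the plain-text `topics` format, and the tokenizer choice are assumptions, not part of this commit):

```python
# Sketch: build a feature Vector for one repository with the new Pipeline API.
from transformers import AutoTokenizer

from model.pre.pipeline import Pipeline

repository = {
    "name": "streaming-with-flink/examples-java",  # illustrative values only
    "readme": "Examples for stream processing with Apache Flink.",
    "description": "Flink examples written in Java.",
    "topics": "flink streaming examples java",  # plain text for the tokenizer
    "cpd": 0.4,
    "rc": 0.1,
    "ic": 0.2,
}
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vector = Pipeline(repository, tokenizer).apply()
```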
45 changes: 0 additions & 45 deletions models/model/pre/pre_description.py

This file was deleted.

44 changes: 0 additions & 44 deletions models/model/pre/pre_name.py

This file was deleted.

47 changes: 0 additions & 47 deletions models/model/pre/pre_readme.py

This file was deleted.

44 changes: 0 additions & 44 deletions models/model/pre/pre_topics.py

This file was deleted.


1 comment on commit ce612fd


@0pdd commented on ce612fd Jun 6, 2024


Puzzle 143-2cf5479a disappeared from models/model/pre/embeddings.py, that's why I closed #153. Please remember that the puzzle was not necessarily removed in this particular commit. Maybe it happened earlier, but we discovered this fact only now.
