
Commit

Merge pull request #9 from gessi-chatbots/faster-preprocessing
Faster preprocessing
mtiessler authored Sep 30, 2024
2 parents 9d17aaf + 36736b8 commit b624a1c
Showing 10 changed files with 208 additions and 175 deletions.
2 changes: 2 additions & 0 deletions .env
@@ -0,0 +1,2 @@
DG_SERVICE_URL=http://localhost
DG_SERVICE_PORT=3008
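For context, these variables are consumed in backend/dendogram_service.py through python-dotenv and os.getenv. A minimal sketch of that pattern, mirroring the URL/port/endpoint construction done in call_preprocessing_service later in this diff:

```python
# Sketch of how the .env values are read; follows the load_dotenv + os.getenv
# pattern used in backend/dendogram_service.py. Example values are the ones
# added in this commit.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory

url = os.getenv("DG_SERVICE_URL")    # e.g. http://localhost
port = os.getenv("DG_SERVICE_PORT")  # e.g. 3008

if not url or not port:
    raise RuntimeError("Preprocessing service URL or port not configured")

full_url = f"{url}:{port}/preprocess"
print(full_url)  # http://localhost:3008/preprocess
```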
4 changes: 3 additions & 1 deletion Pipfile
@@ -13,6 +13,8 @@ transformers = "*"
contractions = "*"
spacy = "*"
pyspellchecker = "*"
python-dotenv = "*"
pandas = "*"

[requires]
python_version = "3.9"
python_version = "3.9"
30 changes: 21 additions & 9 deletions backend/Affinity_strategy.py
@@ -9,9 +9,10 @@
import os
import joblib
import torch
import pandas as pd

MODEL_DIRECTORY_PATH = 'static' + os.path.sep + 'pkls'

MODEL_DIRECTORY_CSV_PATH = 'static' + os.path.sep + 'csv'

class AffinityStrategy():
@abstractmethod
@@ -56,7 +57,6 @@ def compute_affinity(self, data: List):
# file_path = os.path.join(os.getcwd(), MODEL_DIRECTORY_PATH, file_name)
# joblib.dump(model_info, file_path)
# return file_path

class BERTCosineEmbeddingAffinity(AffinityStrategy):

def compute_affinity(self,
@@ -76,31 +76,30 @@ def compute_affinity(self,
def process_batch(batch_data, batch_index):
print(f"Processing batch {batch_index + 1}/{(len(data) + batch_size - 1) // batch_size}...")

# Tokenize and pad sentences in the batch
tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in batch_data]
max_len = max(len(sent) for sent in tokenized_sentences)
padded_sentences = [sent + [tokenizer.pad_token_id] * (max_len - len(sent)) for sent in tokenized_sentences]
input_ids = torch.tensor(padded_sentences)

# Get BERT embeddings
print(f"Getting BERT embeddings for batch {batch_index + 1}...")
with torch.no_grad():
outputs = model(input_ids)
embeddings = outputs.last_hidden_state[:, 0, :] # CLS token embeddings

# Apply verb and object weights
print(f"Applying verb and object weights for batch {batch_index + 1}...")
tagged_data = [nlp(sent) for sent in batch_data]

for i, doc in enumerate(tagged_data):
for token in doc:
if token.pos_ == 'VERB' and verb_weight != 0:
token_position = token.pos_
if token_position == 'VERB' and verb_weight != 0:
embeddings[i] += verb_weight * embeddings[i]
elif token.pos_ == 'NOUN' and object_weight != 0:
elif token_position == 'NOUN' and object_weight != 0:
embeddings[i] += object_weight * embeddings[i]

return embeddings

all_embeddings = [] # To store all batches of embeddings
all_embeddings = []
batch_size = 32

print(f"Processing data in batches of size {batch_size}...")
@@ -129,6 +128,19 @@ def process_batch(batch_data, batch_index):

clustering_model.fit(dense_data_array)

# Get labels from clustering results
labels = clustering_model.labels_

# Create a DataFrame to save the clustering results
df_results = pd.DataFrame({'Sentence': data, 'Cluster': labels})

# Save the DataFrame to a CSV file
csv_file_name = f"{application_name}_bert_cosine_{linkage}_results.csv"
csv_file_path = os.path.join(os.getcwd(), MODEL_DIRECTORY_CSV_PATH, csv_file_name)

print(f"Saving clustering results to {csv_file_path}...")
df_results.to_csv(csv_file_path, index=False)

# Save the clustering model and other information
print("Saving the clustering model and metadata...")
model_info = {
@@ -148,7 +160,7 @@ def process_batch(batch_data, batch_index):
joblib.dump(model_info, file_path)

print("Process completed.")
return file_path
return csv_file_path, file_path

# class BERTEuclideanEmbeddingAffinity(AffinityStrategy):
# def compute_affinity(self, application_name, data: List, linkage, distance_threshold):
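The batched embedding step in process_batch is standard Hugging Face transformers usage. As a reference, here is a self-contained sketch of the same idea; the bert-base-uncased checkpoint and the sample sentences are assumptions, since this hunk does not show how the tokenizer and model are loaded:

```python
# Minimal sketch of the batched CLS-embedding step; checkpoint name and sample
# inputs are assumptions for illustration.
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

def cls_embeddings(batch_data):
    # Tokenize, then right-pad every sentence to the longest one in the batch
    tokenized = [tokenizer.encode(sent, add_special_tokens=True) for sent in batch_data]
    max_len = max(len(ids) for ids in tokenized)
    padded = [ids + [tokenizer.pad_token_id] * (max_len - len(ids)) for ids in tokenized]
    input_ids = torch.tensor(padded)

    with torch.no_grad():
        outputs = model(input_ids)

    # Keep only the [CLS] position of the last hidden layer, one vector per sentence
    return outputs.last_hidden_state[:, 0, :]

print(cls_embeddings(["add dark mode", "export data to csv"]).shape)  # torch.Size([2, 768])
```

Taking last_hidden_state[:, 0, :] yields one [CLS] vector per sentence, which is what the strategy above then re-weights for verbs and nouns and passes to the clustering model before dumping the labels to CSV.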
1 change: 0 additions & 1 deletion backend/dendogram_controller.py
@@ -23,7 +23,6 @@ def generate_dendogram():
if request_content['features'] is None:
return make_response("No features", 400)

# TODO we should add the params in the request body, too many
dendogram_file = dendogram_service.generate_dendogram(preprocessing,
affinity,
linkage,
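For illustration only, a client call matching the contract visible in this controller: features are required in the JSON body, and a missing list yields a 400. The route, host, port, and query-parameter names are assumptions, as they are not shown in this hunk:

```python
# Hypothetical client call; only the "features" body field and the 400 response
# on missing features come from the diff -- route and parameters are assumed.
import requests

payload = {"features": ["share photo", "export data to csv", "enable dark mode"]}
params = {
    "preprocessing": "true",            # assumed query parameter
    "affinity": "bert-embedding-cosine",
    "linkage": "average",
}

resp = requests.post("http://localhost:5000/dendogram", json=payload, params=params)
resp.raise_for_status()
print(resp.status_code)
```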
165 changes: 23 additions & 142 deletions backend/dendogram_service.py
@@ -1,30 +1,21 @@
import stanza
import unicodedata
import contractions
import re
import string
import spacy
import requests
import json
import os
from spellchecker import SpellChecker
from .Context import Context
from . import Affinity_strategy
from dotenv import load_dotenv


load_dotenv()
def preprocessed_app(app_name):
file_path = f"static/preprocessed_jsons/{app_name}Features.json"
if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
return True
return False

return os.path.exists(file_path) and os.path.getsize(file_path) > 0

def save_preprocessed_features(features, app_name):
file_path = f"static/preprocessed_jsons/{app_name}Features.json"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, "w") as json_file:
json.dump(features, json_file)


def load_saved_preprocessed_features(app_name):
file_path = f"static/preprocessed_jsons/{app_name}Features.json"
if not os.path.exists(file_path):
@@ -33,7 +24,6 @@ def load_saved_preprocessed_features(app_name):
return json.load(json_file)
return None


def generate_dendogram(preprocessing,
embedding,
linkage,
@@ -45,26 +35,12 @@ def generate_dendogram(preprocessing,
features = request_content['features']

if preprocessing and not preprocessed_app(app_name):
features = preprocess_features(features)
features = call_preprocessing_service(features)
save_preprocessed_features(features, app_name)
elif preprocessing and preprocessed_app(app_name):
features = load_saved_preprocessed_features(app_name)



# if embedding == 'tf-idf-cosine' or embedding == 'all':
# context = Context(Affinity_strategy.TfIdfCosineAffinity())
# model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)

# if embedding == 'tf-idf-euclidean' or embedding == 'all':
# context = Context(Affinity_strategy.TfIdfEuclideanAffinity())
# model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)

# if embedding == 'bert-embedding-euclidean' or embedding == 'all':
# context = Context(Affinity_strategy.BERTEuclideanEmbeddingAffinity())
# model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)

if embedding == 'bert-embedding-cosine' or embedding == 'all':
if embedding == 'bert-embedding-cosine':
context = Context(Affinity_strategy.BERTCosineEmbeddingAffinity())
return context.use_affinity_algorithm(application_name=app_name,
data=features,
@@ -73,121 +49,26 @@ def process_batch(batch_data, batch_index):
verb_weight=verb_weight,
distance_threshold=distance_threshold)

# if embedding == 'paraphrase-MiniLM-cosine' or embedding == 'all':
# context = Context(Affinity_strategy.ParaphraseMiniLMCosineEmbeddingAffinity())
# model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)

# if embedding == 'paraphrase-MiniLM-euclidean' or embedding == 'all':
# context = Context(Affinity_strategy.ParaphraseMiniLMEuclideanEmbeddingAffinity())
# model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)



# TODO preprocess service
def is_english(text):
pattern = re.compile(r'^[a-zA-Z0-9\s.,?!\'"-]+$')
return bool(pattern.match(text))


def is_emoji_only(text):
emoji_pattern = re.compile(
"[\U00010000-\U0010FFFF]+",
flags=re.UNICODE
)
return bool(emoji_pattern.fullmatch(text))


def contains_weird_characters(text):
weird_characters_pattern = re.compile(r'[^a-zA-Z0-9\s.,?!\'"_-]')
return bool(weird_characters_pattern.search(text))


def preprocess_features(features):
preprocessed_features = []
for feature in features:
if not is_emoji_only(feature) and not contains_weird_characters(feature):
preprocessed_feature = preprocess_feature(feature)
if is_english(preprocessed_feature):
preprocessed_features.append(preprocessed_feature)

return preprocessed_features


def preprocess_feature(feature):
feature = feature.replace('_', ' ')
feature = remove_mentions_and_tags(feature)
# feature = remove_numbers(feature) TODO Check with Quim
feature = camel_case_to_words(feature)
feature = expand_contractions(feature)
feature = remove_special_characters(feature)
feature = remove_punctuation(feature)
feature = standarize_accents(feature)
# feature = spell_check(feature)
feature = lemmatize_spacy(feature)
# feature = lemmatize_stanza(feature)
feature = feature.lower()
return feature


def expand_contractions(feature):
expanded_words = []
for word in feature.split():
expanded_words.append(contractions.fix(word))
return ' '.join(expanded_words)


def standarize_accents(feature):
return unicodedata.normalize('NFKD', feature).encode('ascii', 'ignore').decode('utf-8', 'ignore')


def remove_mentions_and_tags(text):
text = re.sub(r'@\S*', '', text)
return re.sub(r'#\S*', '', text)


def remove_special_characters(text):
pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
return re.sub(pat, '', text)


def remove_numbers(text):
pattern = r'[^a-zA-z.,!?/:;\"\'\s]'
return re.sub(pattern, '', text)


def remove_punctuation(text):
return ''.join([c for c in text if c not in string.punctuation])


def camel_case_to_words(camel_case_str):
words = re.sub('([a-z])([A-Z])', r'\1 \2', camel_case_str)
return words


def lemmatize_spacy(feature):
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
doc = nlp(feature)
return " ".join([token.lemma_ for token in doc])
def call_preprocessing_service(features):
url = os.getenv("DG_SERVICE_URL")
port = os.getenv("DG_SERVICE_PORT")

if not url or not port:
raise Exception("Preprocessing service URL or port not found in environment variables.")

def lemmatize_stanza(feature):
stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
doc = nlp(feature)
lemmatized_feature = ' '.join([word.lemma for sent in doc.sentences for word in sent.words])
return lemmatized_feature
full_url = f"{url}:{port}/preprocess"

data = {
"features": features
}

def spell_check(feature):
spell = SpellChecker()
corrected_feature = []
for word in feature.split():
corrected_word = spell.correction(word)
if corrected_word is not None:
corrected_feature.append(corrected_word)
try:
response = requests.post(full_url, json=data)
if response.status_code == 200:
return response.json()['preprocessed_features']
else:
corrected_feature.append(word)
if corrected_feature is None:
return ""
return " ".join(corrected_feature)
raise Exception(
f"Failed to preprocess features. Status code: {response.status_code}, Response: {response.text}")
except Exception as e:
raise Exception(f"Error occurred while calling preprocessing service: {str(e)}")