
Commit

Merge pull request #9 from gessi-chatbots/faster-preprocessing
Faster preprocessing
mtiessler authored Sep 30, 2024
2 parents 9d17aaf + 36736b8 commit b624a1c
Showing 10 changed files with 208 additions and 175 deletions.
2 changes: 2 additions & 0 deletions .env
@@ -0,0 +1,2 @@
DG_SERVICE_URL=http://localhost
DG_SERVICE_PORT=3008
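For context, these variables are consumed in backend/dendogram_service.py through python-dotenv and os.getenv. A minimal sketch of that pattern, mirroring the URL/port/endpoint construction done in call_preprocessing_service later in this diff:

```python
# Sketch of how the .env values are read; follows the load_dotenv + os.getenv
# pattern used in backend/dendogram_service.py. Example values are the ones
# added in this commit.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory

url = os.getenv("DG_SERVICE_URL")    # e.g. http://localhost
port = os.getenv("DG_SERVICE_PORT")  # e.g. 3008

if not url or not port:
    raise RuntimeError("Preprocessing service URL or port not configured")

full_url = f"{url}:{port}/preprocess"
print(full_url)  # http://localhost:3008/preprocess
```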
4 changes: 3 additions & 1 deletion Pipfile
@@ -13,6 +13,8 @@ transformers = "*"
contractions = "*"
spacy = "*"
pyspellchecker = "*"
python-dotenv = "*"
pandas = "*"

[requires]
python_version = "3.9"
python_version = "3.9"
30 changes: 21 additions & 9 deletions backend/Affinity_strategy.py
@@ -9,9 +9,10 @@
import os
import joblib
import torch
import pandas as pd

MODEL_DIRECTORY_PATH = 'static' + os.path.sep + 'pkls'

MODEL_DIRECTORY_CSV_PATH = 'static' + os.path.sep + 'csv'

class AffinityStrategy():
@abstractmethod
@@ -56,7 +57,6 @@ def compute_affinity(self, data: List):
# file_path = os.path.join(os.getcwd(), MODEL_DIRECTORY_PATH, file_name)
# joblib.dump(model_info, file_path)
# return file_path

class BERTCosineEmbeddingAffinity(AffinityStrategy):

def compute_affinity(self,
@@ -76,31 +76,30 @@ def compute_affinity(self,
def process_batch(batch_data, batch_index):
print(f"Processing batch {batch_index + 1}/{(len(data) + batch_size - 1) // batch_size}...")

# Tokenize and pad sentences in the batch
tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in batch_data]
max_len = max(len(sent) for sent in tokenized_sentences)
padded_sentences = [sent + [tokenizer.pad_token_id] * (max_len - len(sent)) for sent in tokenized_sentences]
input_ids = torch.tensor(padded_sentences)

# Get BERT embeddings
print(f"Getting BERT embeddings for batch {batch_index + 1}...")
with torch.no_grad():
outputs = model(input_ids)
embeddings = outputs.last_hidden_state[:, 0, :] # CLS token embeddings

# Apply verb and object weights
print(f"Applying verb and object weights for batch {batch_index + 1}...")
tagged_data = [nlp(sent) for sent in batch_data]

for i, doc in enumerate(tagged_data):
for token in doc:
if token.pos_ == 'VERB' and verb_weight != 0:
token_position = token.pos_
if token_position == 'VERB' and verb_weight != 0:
embeddings[i] += verb_weight * embeddings[i]
elif token.pos_ == 'NOUN' and object_weight != 0:
elif token_position == 'NOUN' and object_weight != 0:
embeddings[i] += object_weight * embeddings[i]

return embeddings

all_embeddings = [] # To store all batches of embeddings
all_embeddings = []
batch_size = 32

print(f"Processing data in batches of size {batch_size}...")
@@ -129,6 +128,19 @@ def process_batch(batch_data, batch_index):

clustering_model.fit(dense_data_array)

# Get labels from clustering results
labels = clustering_model.labels_

# Create a DataFrame to save the clustering results
df_results = pd.DataFrame({'Sentence': data, 'Cluster': labels})

# Save the DataFrame to a CSV file
csv_file_name = f"{application_name}_bert_cosine_{linkage}_results.csv"
csv_file_path = os.path.join(os.getcwd(), MODEL_DIRECTORY_CSV_PATH, csv_file_name)

print(f"Saving clustering results to {csv_file_path}...")
df_results.to_csv(csv_file_path, index=False)

# Save the clustering model and other information
print("Saving the clustering model and metadata...")
model_info = {
@@ -148,7 +160,7 @@ def process_batch(batch_data, batch_index):
joblib.dump(model_info, file_path)

print("Process completed.")
return file_path
return csv_file_path, file_path

# class BERTEuclideanEmbeddingAffinity(AffinityStrategy):
# def compute_affinity(self, application_name, data: List, linkage, distance_threshold):
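The batched embedding step in process_batch is standard Hugging Face transformers usage. As a reference, here is a self-contained sketch of the same idea; the bert-base-uncased checkpoint and the sample sentences are assumptions, since this hunk does not show how the tokenizer and model are loaded:

```python
# Minimal sketch of the batched CLS-embedding step; checkpoint name and sample
# inputs are assumptions for illustration.
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

def cls_embeddings(batch_data):
    # Tokenize, then right-pad every sentence to the longest one in the batch
    tokenized = [tokenizer.encode(sent, add_special_tokens=True) for sent in batch_data]
    max_len = max(len(ids) for ids in tokenized)
    padded = [ids + [tokenizer.pad_token_id] * (max_len - len(ids)) for ids in tokenized]
    input_ids = torch.tensor(padded)

    with torch.no_grad():
        outputs = model(input_ids)

    # Keep only the [CLS] position of the last hidden layer, one vector per sentence
    return outputs.last_hidden_state[:, 0, :]

print(cls_embeddings(["add dark mode", "export data to csv"]).shape)  # torch.Size([2, 768])
```

Taking last_hidden_state[:, 0, :] yields one [CLS] vector per sentence, which is what the strategy above then re-weights for verbs and nouns and passes to the clustering model before dumping the labels to CSV.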
1 change: 0 additions & 1 deletion backend/dendogram_controller.py
@@ -23,7 +23,6 @@ def generate_dendogram():
if request_content['features'] is None:
return make_response("No features", 400)

# TODO we should add the params in the request body, too many
dendogram_file = dendogram_service.generate_dendogram(preprocessing,
affinity,
linkage,
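For illustration only, a client call matching the contract visible in this controller: features are required in the JSON body, and a missing list yields a 400. The route, host, port, and query-parameter names are assumptions, as they are not shown in this hunk:

```python
# Hypothetical client call; only the "features" body field and the 400 response
# on missing features come from the diff -- route and parameters are assumed.
import requests

payload = {"features": ["share photo", "export data to csv", "enable dark mode"]}
params = {
    "preprocessing": "true",            # assumed query parameter
    "affinity": "bert-embedding-cosine",
    "linkage": "average",
}

resp = requests.post("http://localhost:5000/dendogram", json=payload, params=params)
resp.raise_for_status()
print(resp.status_code)
```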
165 changes: 23 additions & 142 deletions backend/dendogram_service.py
@@ -1,30 +1,21 @@
import stanza
import unicodedata
import contractions
import re
import string
import spacy
import requests
import json
import os
from spellchecker import SpellChecker
from .Context import Context
from . import Affinity_strategy
from dotenv import load_dotenv


load_dotenv()
def preprocessed_app(app_name):
file_path = f"static/preprocessed_jsons/{app_name}Features.json"
if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
return True
return False

return os.path.exists(file_path) and os.path.getsize(file_path) > 0

def save_preprocessed_features(features, app_name):
file_path = f"static/preprocessed_jsons/{app_name}Features.json"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, "w") as json_file:
json.dump(features, json_file)


def load_saved_preprocessed_features(app_name):
file_path = f"static/preprocessed_jsons/{app_name}Features.json"
if not os.path.exists(file_path):
@@ -33,7 +24,6 @@ def load_saved_preprocessed_features(app_name):
return json.load(json_file)
return None


def generate_dendogram(preprocessing,
embedding,
linkage,
@@ -45,26 +35,12 @@ def generate_dendogram(preprocessing,
features = request_content['features']

if preprocessing and not preprocessed_app(app_name):
features = preprocess_features(features)
features = call_preprocessing_service(features)
save_preprocessed_features(features, app_name)
elif preprocessing and preprocessed_app(app_name):
features = load_saved_preprocessed_features(app_name)



# if embedding == 'tf-idf-cosine' or embedding == 'all':
# context = Context(Affinity_strategy.TfIdfCosineAffinity())
# model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)

# if embedding == 'tf-idf-euclidean' or embedding == 'all':
# context = Context(Affinity_strategy.TfIdfEuclideanAffinity())
# model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)

# if embedding == 'bert-embedding-euclidean' or embedding == 'all':
# context = Context(Affinity_strategy.BERTEuclideanEmbeddingAffinity())
# model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)

if embedding == 'bert-embedding-cosine' or embedding == 'all':
if embedding == 'bert-embedding-cosine':
context = Context(Affinity_strategy.BERTCosineEmbeddingAffinity())
return context.use_affinity_algorithm(application_name=app_name,
data=features,
@@ -73,121 +49,26 @@ def process_batch(batch_data, batch_index):
verb_weight=verb_weight,
distance_threshold=distance_threshold)

# if embedding == 'paraphrase-MiniLM-cosine' or embedding == 'all':
# context = Context(Affinity_strategy.ParaphraseMiniLMCosineEmbeddingAffinity())
# model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)

# if embedding == 'paraphrase-MiniLM-euclidean' or embedding == 'all':
# context = Context(Affinity_strategy.ParaphraseMiniLMEuclideanEmbeddingAffinity())
# model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)



# TODO preprocess service
def is_english(text):
pattern = re.compile(r'^[a-zA-Z0-9\s.,?!\'"-]+$')
return bool(pattern.match(text))


def is_emoji_only(text):
emoji_pattern = re.compile(
"[\U00010000-\U0010FFFF]+",
flags=re.UNICODE
)
return bool(emoji_pattern.fullmatch(text))


def contains_weird_characters(text):
weird_characters_pattern = re.compile(r'[^a-zA-Z0-9\s.,?!\'"_-]')
return bool(weird_characters_pattern.search(text))


def preprocess_features(features):
preprocessed_features = []
for feature in features:
if not is_emoji_only(feature) and not contains_weird_characters(feature):
preprocessed_feature = preprocess_feature(feature)
if is_english(preprocessed_feature):
preprocessed_features.append(preprocessed_feature)

return preprocessed_features


def preprocess_feature(feature):
feature = feature.replace('_', ' ')
feature = remove_mentions_and_tags(feature)
# feature = remove_numbers(feature) TODO Check with Quim
feature = camel_case_to_words(feature)
feature = expand_contractions(feature)
feature = remove_special_characters(feature)
feature = remove_punctuation(feature)
feature = standarize_accents(feature)
# feature = spell_check(feature)
feature = lemmatize_spacy(feature)
# feature = lemmatize_stanza(feature)
feature = feature.lower()
return feature


def expand_contractions(feature):
expanded_words = []
for word in feature.split():
expanded_words.append(contractions.fix(word))
return ' '.join(expanded_words)


def standarize_accents(feature):
return unicodedata.normalize('NFKD', feature).encode('ascii', 'ignore').decode('utf-8', 'ignore')


def remove_mentions_and_tags(text):
text = re.sub(r'@\S*', '', text)
return re.sub(r'#\S*', '', text)


def remove_special_characters(text):
pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
return re.sub(pat, '', text)


def remove_numbers(text):
pattern = r'[^a-zA-z.,!?/:;\"\'\s]'
return re.sub(pattern, '', text)


def remove_punctuation(text):
return ''.join([c for c in text if c not in string.punctuation])


def camel_case_to_words(camel_case_str):
words = re.sub('([a-z])([A-Z])', r'\1 \2', camel_case_str)
return words


def lemmatize_spacy(feature):
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
doc = nlp(feature)
return " ".join([token.lemma_ for token in doc])
def call_preprocessing_service(features):
url = os.getenv("DG_SERVICE_URL")
port = os.getenv("DG_SERVICE_PORT")

if not url or not port:
raise Exception("Preprocessing service URL or port not found in environment variables.")

def lemmatize_stanza(feature):
stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
doc = nlp(feature)
lemmatized_feature = ' '.join([word.lemma for sent in doc.sentences for word in sent.words])
return lemmatized_feature
full_url = f"{url}:{port}/preprocess"

data = {
"features": features
}

def spell_check(feature):
spell = SpellChecker()
corrected_feature = []
for word in feature.split():
corrected_word = spell.correction(word)
if corrected_word is not None:
corrected_feature.append(corrected_word)
try:
response = requests.post(full_url, json=data)
if response.status_code == 200:
return response.json()['preprocessed_features']
else:
corrected_feature.append(word)
if corrected_feature is None:
return ""
return " ".join(corrected_feature)
raise Exception(
f"Failed to preprocess features. Status code: {response.status_code}, Response: {response.text}")
except Exception as e:
raise Exception(f"Error occurred while calling preprocessing service: {str(e)}")