diff --git a/.env b/.env
new file mode 100644
index 00000000..9584a6ce
--- /dev/null
+++ b/.env
@@ -0,0 +1,2 @@
+DG_SERVICE_URL=http://localhost
+DG_SERVICE_PORT=3008
diff --git a/Pipfile b/Pipfile
index 9d405f6f..a6e4ecaa 100644
--- a/Pipfile
+++ b/Pipfile
@@ -13,6 +13,8 @@ transformers = "*"
 contractions = "*"
 spacy = "*"
 pyspellchecker = "*"
+python-dotenv = "*"
+pandas = "*"
 
 [requires]
-python_version = "3.9"
\ No newline at end of file
+python_version = "3.9"
diff --git a/backend/Affinity_strategy.py b/backend/Affinity_strategy.py
index ac12d05e..20bcdca5 100644
--- a/backend/Affinity_strategy.py
+++ b/backend/Affinity_strategy.py
@@ -9,9 +9,10 @@ import os
 import joblib
 import torch
+import pandas as pd
 
 MODEL_DIRECTORY_PATH = 'static' + os.path.sep + 'pkls'
-
+MODEL_DIRECTORY_CSV_PATH = 'static' + os.path.sep + 'csv'
 
 class AffinityStrategy():
     @abstractmethod
     def compute_affinity(self, data: List):
@@ -56,7 +57,6 @@ def compute_affinity(self, data: List):
 #         file_path = os.path.join(os.getcwd(), MODEL_DIRECTORY_PATH, file_name)
 #         joblib.dump(model_info, file_path)
 #         return file_path
-
 
 class BERTCosineEmbeddingAffinity(AffinityStrategy):
     def compute_affinity(self,
@@ -76,31 +76,30 @@ def compute_affinity(self,
         def process_batch(batch_data, batch_index):
             print(f"Processing batch {batch_index + 1}/{(len(data) + batch_size - 1) // batch_size}...")
 
-            # Tokenize and pad sentences in the batch
             tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in batch_data]
             max_len = max(len(sent) for sent in tokenized_sentences)
             padded_sentences = [sent + [tokenizer.pad_token_id] * (max_len - len(sent)) for sent in tokenized_sentences]
             input_ids = torch.tensor(padded_sentences)
 
-            # Get BERT embeddings
             print(f"Getting BERT embeddings for batch {batch_index + 1}...")
             with torch.no_grad():
                 outputs = model(input_ids)
                 embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token embeddings
 
-            # Apply verb and object weights
             print(f"Applying verb and object weights for batch {batch_index + 1}...")
             tagged_data = [nlp(sent) for sent in batch_data]
+
             for i, doc in enumerate(tagged_data):
                 for token in doc:
-                    if token.pos_ == 'VERB' and verb_weight != 0:
+                    token_position = token.pos_
+                    if token_position == 'VERB' and verb_weight != 0:
                         embeddings[i] += verb_weight * embeddings[i]
-                    elif token.pos_ == 'NOUN' and object_weight != 0:
+                    elif token_position == 'NOUN' and object_weight != 0:
                         embeddings[i] += object_weight * embeddings[i]
             return embeddings
 
-        all_embeddings = []  # To store all batches of embeddings
+        all_embeddings = []
         batch_size = 32
         print(f"Processing data in batches of size {batch_size}...")
@@ -129,6 +128,19 @@ def process_batch(batch_data, batch_index):
 
         clustering_model.fit(dense_data_array)
+
+        # Get labels from clustering results
+        labels = clustering_model.labels_
+
+        # Create a DataFrame to save the clustering results
+        df_results = pd.DataFrame({'Sentence': data, 'Cluster': labels})
+
+        # Save the DataFrame to a CSV file
+        csv_file_name = f"{application_name}_bert_cosine_{linkage}_results.csv"
+        csv_file_path = os.path.join(os.getcwd(), MODEL_DIRECTORY_CSV_PATH, csv_file_name)
+
+        print(f"Saving clustering results to {csv_file_path}...")
+        df_results.to_csv(csv_file_path, index=False)
+
         # Save the clustering model and other information
         print("Saving the clustering model and metadata...")
         model_info = {
@@ -148,7 +160,7 @@ def process_batch(batch_data, batch_index):
         joblib.dump(model_info, file_path)
 
         print("Process completed.")
-        return file_path
+        return csv_file_path, file_path
 
 # class BERTEuclideanEmbeddingAffinity(AffinityStrategy):
 #     def compute_affinity(self, application_name, data: List, linkage, distance_threshold):
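Note on the changed contract above: BERTCosineEmbeddingAffinity.compute_affinity now returns a (csv_file_path, file_path) pair instead of a single pickle path, so any caller that treated the result as one file name has to unpack two values. A minimal caller sketch, assuming the package is importable as `backend` and that Context.use_affinity_algorithm forwards the strategy's return value as the service code implies (all argument values below are illustrative, not taken from this diff):

    from backend.Context import Context
    from backend import Affinity_strategy

    context = Context(Affinity_strategy.BERTCosineEmbeddingAffinity())
    csv_path, pkl_path = context.use_affinity_algorithm(
        application_name='whatsapp',           # illustrative
        data=['send photo', 'receive call'],   # illustrative
        linkage='average',                     # illustrative
        object_weight=0.25,                    # illustrative
        verb_weight=0.75,                      # illustrative
        distance_threshold=0.08,               # illustrative
    )
    print(csv_path)  # e.g. <cwd>/static/csv/whatsapp_bert_cosine_average_results.csv
    print(pkl_path)  # pickled AgglomerativeClustering model plus metadata under static/pkls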
diff --git a/backend/dendogram_controller.py b/backend/dendogram_controller.py
index 80f96a3b..e53530e8 100644
--- a/backend/dendogram_controller.py
+++ b/backend/dendogram_controller.py
@@ -23,7 +23,6 @@ def generate_dendogram():
     if request_content['features'] is None:
         return make_response("No features", 400)
 
-    # TODO we should add the params in the request body, too many
     dendogram_file = dendogram_service.generate_dendogram(preprocessing,
                                                           affinity,
                                                           linkage,
diff --git a/backend/dendogram_service.py b/backend/dendogram_service.py
index bb6b8b38..64e6a243 100644
--- a/backend/dendogram_service.py
+++ b/backend/dendogram_service.py
@@ -1,22 +1,14 @@
-import stanza
-import unicodedata
-import contractions
-import re
-import string
-import spacy
+import requests
 import json
 import os
-from spellchecker import SpellChecker
 from .Context import Context
 from . import Affinity_strategy
+from dotenv import load_dotenv
 
-
+load_dotenv()
 
 def preprocessed_app(app_name):
     file_path = f"static/preprocessed_jsons/{app_name}Features.json"
-    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
-        return True
-    return False
-
+    return os.path.exists(file_path) and os.path.getsize(file_path) > 0
 
 def save_preprocessed_features(features, app_name):
     file_path = f"static/preprocessed_jsons/{app_name}Features.json"
@@ -24,7 +16,6 @@ def save_preprocessed_features(features, app_name):
     with open(file_path, "w") as json_file:
         json.dump(features, json_file)
 
-
 def load_saved_preprocessed_features(app_name):
     file_path = f"static/preprocessed_jsons/{app_name}Features.json"
     if not os.path.exists(file_path):
@@ -33,7 +24,6 @@ def load_saved_preprocessed_features(app_name):
         return json.load(json_file)
     return None
 
-
 def generate_dendogram(preprocessing,
                        embedding,
                        linkage,
@@ -45,26 +35,12 @@ def generate_dendogram(preprocessing,
     features = request_content['features']
 
     if preprocessing and not preprocessed_app(app_name):
-        features = preprocess_features(features)
+        features = call_preprocessing_service(features)
         save_preprocessed_features(features, app_name)
     elif preprocessing and preprocessed_app(app_name):
         features = load_saved_preprocessed_features(app_name)
-
-
-    # if embedding == 'tf-idf-cosine' or embedding == 'all':
-    #     context = Context(Affinity_strategy.TfIdfCosineAffinity())
-    #     model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)
-
-    # if embedding == 'tf-idf-euclidean' or embedding == 'all':
-    #     context = Context(Affinity_strategy.TfIdfEuclideanAffinity())
-    #     model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)
-
-    # if embedding == 'bert-embedding-euclidean' or embedding == 'all':
-    #     context = Context(Affinity_strategy.BERTEuclideanEmbeddingAffinity())
-    #     model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)
-
-    if embedding == 'bert-embedding-cosine' or embedding == 'all':
+    if embedding == 'bert-embedding-cosine':
         context = Context(Affinity_strategy.BERTCosineEmbeddingAffinity())
         return context.use_affinity_algorithm(application_name=app_name,
                                               data=features,
@@ -73,121 +49,26 @@ def generate_dendogram(preprocessing,
                                               verb_weight=verb_weight,
                                               distance_threshold=distance_threshold)
 
-    # if embedding == 'paraphrase-MiniLM-cosine' or embedding == 'all':
-    #     context = Context(Affinity_strategy.ParaphraseMiniLMCosineEmbeddingAffinity())
-    #     model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)
-
-    # if embedding == 'paraphrase-MiniLM-euclidean' or embedding == 'all':
-    #     context = Context(Affinity_strategy.ParaphraseMiniLMEuclideanEmbeddingAffinity())
-    #     model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)
-
-
-
-# TODO preprocess service
-def is_english(text):
-    pattern = re.compile(r'^[a-zA-Z0-9\s.,?!\'"-]+$')
-    return bool(pattern.match(text))
-
-
-def is_emoji_only(text):
-    emoji_pattern = re.compile(
-        "[\U00010000-\U0010FFFF]+",
-        flags=re.UNICODE
-    )
-    return bool(emoji_pattern.fullmatch(text))
-
-
-def contains_weird_characters(text):
-    weird_characters_pattern = re.compile(r'[^a-zA-Z0-9\s.,?!\'"_-]')
-    return bool(weird_characters_pattern.search(text))
-
-
-def preprocess_features(features):
-    preprocessed_features = []
-    for feature in features:
-        if not is_emoji_only(feature) and not contains_weird_characters(feature):
-            preprocessed_feature = preprocess_feature(feature)
-            if is_english(preprocessed_feature):
-                preprocessed_features.append(preprocessed_feature)
-
-    return preprocessed_features
-
-
-def preprocess_feature(feature):
-    feature = feature.replace('_', ' ')
-    feature = remove_mentions_and_tags(feature)
-    # feature = remove_numbers(feature) TODO Check with Quim
-    feature = camel_case_to_words(feature)
-    feature = expand_contractions(feature)
-    feature = remove_special_characters(feature)
-    feature = remove_punctuation(feature)
-    feature = standarize_accents(feature)
-    # feature = spell_check(feature)
-    feature = lemmatize_spacy(feature)
-    # feature = lemmatize_stanza(feature)
-    feature = feature.lower()
-    return feature
-
-
-def expand_contractions(feature):
-    expanded_words = []
-    for word in feature.split():
-        expanded_words.append(contractions.fix(word))
-    return ' '.join(expanded_words)
-
-
-def standarize_accents(feature):
-    return unicodedata.normalize('NFKD', feature).encode('ascii', 'ignore').decode('utf-8', 'ignore')
-
-
-def remove_mentions_and_tags(text):
-    text = re.sub(r'@\S*', '', text)
-    return re.sub(r'#\S*', '', text)
-
-
-def remove_special_characters(text):
-    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
-    return re.sub(pat, '', text)
-
-
-def remove_numbers(text):
-    pattern = r'[^a-zA-z.,!?/:;\"\'\s]'
-    return re.sub(pattern, '', text)
-
-
-def remove_punctuation(text):
-    return ''.join([c for c in text if c not in string.punctuation])
-
-
-def camel_case_to_words(camel_case_str):
-    words = re.sub('([a-z])([A-Z])', r'\1 \2', camel_case_str)
-    return words
-
-def lemmatize_spacy(feature):
-    spacy.prefer_gpu()
-    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
-    doc = nlp(feature)
-    return " ".join([token.lemma_ for token in doc])
+def call_preprocessing_service(features):
+    url = os.getenv("DG_SERVICE_URL")
+    port = os.getenv("DG_SERVICE_PORT")
+    if not url or not port:
+        raise Exception("Preprocessing service URL or port not found in environment variables.")
 
-def lemmatize_stanza(feature):
-    stanza.download('en')
-    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
-    doc = nlp(feature)
-    lemmatized_feature = ' '.join([word.lemma for sent in doc.sentences for word in sent.words])
-    return lemmatized_feature
+    full_url = f"{url}:{port}/preprocess"
+    data = {
+        "features": features
+    }
 
-def spell_check(feature):
-    spell = SpellChecker()
-    corrected_feature = []
-    for word in feature.split():
-        corrected_word = spell.correction(word)
-        if corrected_word is not None:
-            corrected_feature.append(corrected_word)
+    try:
+        response = requests.post(full_url, json=data)
+        if response.status_code == 200:
+            return response.json()['preprocessed_features']
         else:
-            corrected_feature.append(word)
-    if corrected_feature is None:
-        return ""
-    return " ".join(corrected_feature)
+            raise Exception(
+                f"Failed to preprocess features. Status code: {response.status_code}, Response: {response.text}")
+    except Exception as e:
+        raise Exception(f"Error occurred while calling preprocessing service: {str(e)}")
\ No newline at end of file
diff --git a/backend/preprocessing_service.py b/backend/preprocessing_service.py
new file mode 100644
index 00000000..c1ae836b
--- /dev/null
+++ b/backend/preprocessing_service.py
@@ -0,0 +1,119 @@
+# preprocessing_service.py
+
+from flask import Flask, request, jsonify
+import os
+import json
+from spellchecker import SpellChecker
+import stanza
+import unicodedata
+import contractions
+import re
+import string
+import spacy
+
+app = Flask(__name__)
+
+@app.route('/preprocess', methods=['POST'])
+def preprocess():
+    try:
+        request_data = request.get_json()
+        app_name = request_data.get('app_name')
+        features = request_data['features']
+
+        # Preprocess by default; 'app_name' and 'preprocessing' are optional in the request body
+        if request_data.get('preprocessing', True):
+            features = preprocess_features(features)
+            if app_name:
+                save_preprocessed_features(features, app_name)
+        elif app_name:
+            features = load_saved_preprocessed_features(app_name)
+
+        return jsonify({"preprocessed_features": features}), 200
+
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+def preprocessed_app(app_name):
+    file_path = f"static/preprocessed_jsons/{app_name}Features.json"
+    return os.path.exists(file_path) and os.path.getsize(file_path) > 0
+
+def save_preprocessed_features(features, app_name):
+    file_path = f"static/preprocessed_jsons/{app_name}Features.json"
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    with open(file_path, "w") as json_file:
+        json.dump(features, json_file)
+
+def load_saved_preprocessed_features(app_name):
+    file_path = f"static/preprocessed_jsons/{app_name}Features.json"
+    if os.path.exists(file_path):
+        with open(file_path, "r") as json_file:
+            return json.load(json_file)
+    return None
+
+def preprocess_features(features):
+    preprocessed_features = []
+    for feature in features:
+        if not is_emoji_only(feature) and not contains_weird_characters(feature):
+            preprocessed_feature = preprocess_feature(feature)
+            if is_english(preprocessed_feature):
+                preprocessed_features.append(preprocessed_feature)
+
+    return preprocessed_features
+
+def preprocess_feature(feature):
+    feature = feature.replace('_', ' ')
+    feature = remove_mentions_and_tags(feature)
+    feature = camel_case_to_words(feature)
+    feature = expand_contractions(feature)
+    feature = remove_special_characters(feature)
+    feature = remove_punctuation(feature)
+    feature = standarize_accents(feature)
+    feature = lemmatize_spacy(feature)
+    feature = feature.lower()
+    return feature
+
+# Utility functions
+def expand_contractions(feature):
+    expanded_words = []
+    for word in feature.split():
+        expanded_words.append(contractions.fix(word))
+    return ' '.join(expanded_words)
+
+def standarize_accents(feature):
+    return unicodedata.normalize('NFKD', feature).encode('ascii', 'ignore').decode('utf-8', 'ignore')
+
+def remove_mentions_and_tags(text):
+    text = re.sub(r'@\S*', '', text)
+    return re.sub(r'#\S*', '', text)
+
+def remove_special_characters(text):
+    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
+    return re.sub(pat, '', text)
+
+def remove_punctuation(text):
+    return ''.join([c for c in text if c not in string.punctuation])
+
+def camel_case_to_words(camel_case_str):
+    words = re.sub('([a-z])([A-Z])', r'\1 \2', camel_case_str)
+    return words
+
+def lemmatize_spacy(feature):
+    spacy.prefer_gpu()
+    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
+    doc = nlp(feature)
+    return " ".join([token.lemma_ for token in doc])
+
+def is_english(text):
+    pattern = re.compile(r'^[a-zA-Z0-9\s.,?!\'"-]+$')
+    return bool(pattern.match(text))
+
+def is_emoji_only(text):
+    emoji_pattern = re.compile("[\U00010000-\U0010FFFF]+", flags=re.UNICODE)
+    return bool(emoji_pattern.fullmatch(text))
+
+def contains_weird_characters(text):
+    weird_characters_pattern = re.compile(r'[^a-zA-Z0-9\s.,?!\'"_-]')
+    return bool(weird_characters_pattern.search(text))
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=int(os.getenv("DG_SERVICE_PORT", "3008")))  # keep in sync with DG_SERVICE_PORT in .env
diff --git a/client/visualizator.py b/client/visualizator.py
index 986ac05f..962d0787 100644
--- a/client/visualizator.py
+++ b/client/visualizator.py
@@ -3,8 +3,11 @@
 from scipy.cluster.hierarchy import dendrogram
 import joblib
 import argparse
+import os
 
 CLUSTER_COLOR_THRESHOLD = 0.08
+
+
 
 def add_line_breaks(labels):
     return [label.replace(' ', '\n') for label in labels]
@@ -25,18 +28,14 @@ def plot_dendrogram(model, labels, **kwargs):
         [model.children_, model.distances_, counts]
     ).astype(float)
 
     labels = add_line_breaks(labels=labels)
+
     dendrogram(linkage_matrix,
                labels=labels,
                color_threshold=CLUSTER_COLOR_THRESHOLD,
-               ** kwargs)
-    '''
-    for i, d, c in zip(linkage_matrix[:, 0], linkage_matrix[:, 1], linkage_matrix[:, 2]):
-        x = 0
-        y = c
-        plt.annotate('%.2f' % c, (x, y), xytext=(550, 0),
-                     textcoords='offset points', va='top', ha='center')
-    '''
-    plt.xticks(rotation=90, fontsize=10)
+               leaf_font_size=10,
+               **kwargs)
+
+    plt.xticks(rotation=90, fontsize=10, ha='right')
 
 def show_dendrogram(model_file):
@@ -46,35 +45,54 @@ def show_dendrogram(model_file):
     model = file['model']
     affinity = file['affinity']
     labels = file['labels']
+
     try:
         verb_weight = file['verb_weight']
     except KeyError:
         verb_weight = 'N/A'
-    try:
+
+    try:
         object_weight = file['object_weight']
     except KeyError:
         object_weight = 'N/A'
 
     if hasattr(model, 'children_'):
-        plt.figure(figsize=(15, 8))
+        n_leaves = len(labels)
+
+        max_figsize_width = 30
+        max_figsize_height = 15
+        figsize_width = min(max_figsize_width, n_leaves * 0.5)
+        figsize_height = max(12, min(max_figsize_height, n_leaves * 0.25))
+
+        plt.figure(figsize=(figsize_width, figsize_height))
+
         plot_dendrogram(model, labels=labels)
+
         plt.title(application_name + ' | ' + affinity
                   + ' | Distance Threshold: ' + str(distance_threshold)
                   + ' | Verb Weight: ' + str(verb_weight)
-                  + ' | Object weight: ' + str(object_weight))
-        plt.xlabel('Features', fontsize=12)
-        plt.ylabel('Distance', fontsize=12)
-        plt.xticks(rotation=90, fontsize=8)
+                  + ' | Object weight: ' + str(object_weight),
+                  fontsize=14)
+        plt.xlabel('Features', fontsize=14)
+        plt.ylabel('Distance', fontsize=14)
+        plt.subplots_adjust(bottom=0.2)
         plt.tight_layout()
-        plt.show()
+
+        save_directory = r"C:\Users\Max\NLP4RE\Dendogram-Generator\static\png"
+        base_name = os.path.splitext(os.path.basename(model_file))[0]
+        save_path = os.path.join(save_directory, f"{base_name}.png")
+        plt.savefig(save_path)
+        plt.close()
+        print(f"Dendrogram saved at: {save_path}")
     else:
         raise ValueError("The provided model is not AgglomerativeClustering.")
 
-
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Visualize dendrogram from a model file.")
-    parser.add_argument("model_file", help="Path to the model file")
-    args = parser.parse_args()
+    pkls_directory = r"C:\Users\Max\NLP4RE\Dendogram-Generator\static\pkls"
 
-    show_dendrogram(args.model_file)
+    for filename in os.listdir(pkls_directory):
+        if filename.endswith('.pkl'):
+            model_file = os.path.join(pkls_directory, filename)
+            print(f"Processing: {model_file}")
+            show_dendrogram(model_file)
\ No newline at end of file
diff --git a/static/feature_preprocessings/Readme.md b/static/feature_preprocessings/Readme.md
deleted file mode 100644
index 34f5454d..00000000
--- a/static/feature_preprocessings/Readme.md
+++ /dev/null
@@ -1 +0,0 @@
-Feature preprocessing files
\ No newline at end of file
diff --git a/static/preprocessed_jsons/viber.voipFeatures.json b/static/preprocessed_jsons/viber.voipFeatures.json
new file mode 100644
index 00000000..b03bfa37
--- /dev/null
+++ b/static/preprocessed_jsons/viber.voipFeatures.json
@@ -0,0 +1 @@
+["really poor performance", "nice satisfactory", "change rating", "drop call", "lifeline", "easy contact", "face problem", "ek baar bnd krte", "great o", "need more update", "dilip", "add option", "bad whatsapp", "grand app", "very low performance", "cf p", "make video call", "activate phone number", "very gud", "add folder feature", "viber", "restore datum", "have fold", "excellence"]
\ No newline at end of file
diff --git a/static/preprocessed_jsons/whatsappFeatures.json b/static/preprocessed_jsons/whatsappFeatures.json
new file mode 100644
index 00000000..d90db439
--- /dev/null
+++ b/static/preprocessed_jsons/whatsappFeatures.json
@@ -0,0 +1 @@
+["good app", "nice app", "absolute bias", "love app", "whatsapp", "bad app", "like app", "very nice app", "great app", "good", "love", "op", "use app", "very good app", "access whatsapp", "send code", "add feature", "app", "face issue", "love whatsapp", "new update", "update whatsapp", "download app", "good application", "good experience", "have problem", "open whatsapp", "remove channel", "send photo", "thank", "use whatsapp", "message app", "bad update", "ban account", "face problem", "freedom", "get verification code", "give star", "good work", "great experience", "hate new update", "open app", "supar", "very bad update", "very nice", "add option", "improve platform", "fantastic and amarze whatsapp", "very goad whatsapp", "see profile picture", "a", "star", "experience", "amazing app", "archive community group", "awesome", "be multi device support", "bring old status", "chat app", "enjoy app", "enjoy service", "excellence", "excellent service", "exellent", "fantastic", "free plastinee", "get code", "get notification", "good job", "good whatsapp", "good working", "great", "have app", "have issue", "have question", "improve quality", "install whatsapp", "late update attempt", "life", "like new update", "like update", "like way", "like whatsapp", "mast", "message", "mwaj hasani", "nandini", "need improvement", "nice aap", "nice application", "nice work", "post video", "receive call", "receive message", "receive verification code", "remove channel feature", "remove channel option", "see status", "send message", "send verification code", "send video", "share", "superb", "trash", "uninstall whatsapp", "update app", "useful app", "use", "veri", "very bad app", "very bad experience", "very good whatsapp", "very helpful great app", "very useful app", "whatapp", "wonderful", "work", "yas"]
\ No newline at end of file
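Usage note: a minimal sketch of exercising the new /preprocess endpoint end to end, assuming the Flask app in backend/preprocessing_service.py is running and DG_SERVICE_URL / DG_SERVICE_PORT are set as in the .env added above; the feature strings are illustrative only.

    import os
    import requests
    from dotenv import load_dotenv

    load_dotenv()  # picks up DG_SERVICE_URL / DG_SERVICE_PORT from .env
    base = f"{os.getenv('DG_SERVICE_URL', 'http://localhost')}:{os.getenv('DG_SERVICE_PORT', '3008')}"

    # Same payload shape that call_preprocessing_service() in backend/dendogram_service.py sends
    response = requests.post(f"{base}/preprocess", json={"features": ["SendPhoto", "can't receive calls"]})
    response.raise_for_status()
    print(response.json()["preprocessed_features"])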