From 32aa6ef3aace78e4033600e31e6e92ba5908078d Mon Sep 17 00:00:00 2001
From: mtiessler
Date: Mon, 30 Sep 2024 09:33:27 +0200
Subject: [PATCH 1/6] creating .env and preprocessing_service.py

---
 backend/dendogram_service.py     | 163 +++----------------------------
 backend/preprocessing_service.py | 118 ++++++++++++++++++++++
 2 files changed, 134 insertions(+), 147 deletions(-)
 create mode 100644 backend/preprocessing_service.py

diff --git a/backend/dendogram_service.py b/backend/dendogram_service.py
index bb6b8b38..f9adb356 100644
--- a/backend/dendogram_service.py
+++ b/backend/dendogram_service.py
@@ -1,22 +1,10 @@
-import stanza
-import unicodedata
-import contractions
-import re
-import string
-import spacy
+import requests
 import json
 import os
-from spellchecker import SpellChecker
-from .Context import Context
-from . import Affinity_strategy
-
 
 def preprocessed_app(app_name):
     file_path = f"static/preprocessed_jsons/{app_name}Features.json"
-    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
-        return True
-    return False
-
+    return os.path.exists(file_path) and os.path.getsize(file_path) > 0
 
 def save_preprocessed_features(features, app_name):
     file_path = f"static/preprocessed_jsons/{app_name}Features.json"
@@ -24,7 +12,6 @@
     with open(file_path, "w") as json_file:
         json.dump(features, json_file)
 
-
 def load_saved_preprocessed_features(app_name):
     file_path = f"static/preprocessed_jsons/{app_name}Features.json"
     if not os.path.exists(file_path):
@@ -33,7 +20,6 @@ def load_saved_preprocessed_features(app_name):
     with open(file_path, "r") as json_file:
         return json.load(json_file)
     return None
-
 def generate_dendogram(preprocessing,
                        embedding,
                        linkage,
@@ -45,25 +31,12 @@ def generate_dendogram(preprocessing,
     features = request_content['features']
 
     if preprocessing and not preprocessed_app(app_name):
-        features = preprocess_features(features)
+        # Call the external preprocessing service
+        features = call_preprocessing_service(features)
         save_preprocessed_features(features, app_name)
     elif preprocessing and preprocessed_app(app_name):
         features = load_saved_preprocessed_features(app_name)
-
-
-    # if embedding == 'tf-idf-cosine' or embedding == 'all':
-    # context = Context(Affinity_strategy.TfIdfCosineAffinity())
-    # model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)
-
-    # if embedding == 'tf-idf-euclidean' or embedding == 'all':
-    # context = Context(Affinity_strategy.TfIdfEuclideanAffinity())
-    # model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)
-
-    # if embedding == 'bert-embedding-euclidean' or embedding == 'all':
-    # context = Context(Affinity_strategy.BERTEuclideanEmbeddingAffinity())
-    # model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)
-
 
     if embedding == 'bert-embedding-cosine' or embedding == 'all':
         context = Context(Affinity_strategy.BERTCosineEmbeddingAffinity())
         return context.use_affinity_algorithm(application_name=app_name,
@@ -73,121 +46,17 @@ def generate_dendogram(preprocessing,
                                               verb_weight=verb_weight,
                                               distance_threshold=distance_threshold)
 
-    # if embedding == 'paraphrase-MiniLM-cosine' or embedding == 'all':
-    # context = Context(Affinity_strategy.ParaphraseMiniLMCosineEmbeddingAffinity())
-    # model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)
-
-    # if embedding == 'paraphrase-MiniLM-euclidean' or embedding == 'all':
-    # context = Context(Affinity_strategy.ParaphraseMiniLMEuclideanEmbeddingAffinity())
-    # model_file_name = context.use_affinity_algorithm(app_name, features, linkage, distance_threshold)
-
-
-
-# TODO preprocess service
-def is_english(text):
-    pattern = re.compile(r'^[a-zA-Z0-9\s.,?!\'"-]+$')
-    return bool(pattern.match(text))
-
-
-def is_emoji_only(text):
-    emoji_pattern = re.compile(
-        "[\U00010000-\U0010FFFF]+",
-        flags=re.UNICODE
-    )
-    return bool(emoji_pattern.fullmatch(text))
-
-
-def contains_weird_characters(text):
-    weird_characters_pattern = re.compile(r'[^a-zA-Z0-9\s.,?!\'"_-]')
-    return bool(weird_characters_pattern.search(text))
-
-
-def preprocess_features(features):
-    preprocessed_features = []
-    for feature in features:
-        if not is_emoji_only(feature) and not contains_weird_characters(feature):
-            preprocessed_feature = preprocess_feature(feature)
-            if is_english(preprocessed_feature):
-                preprocessed_features.append(preprocessed_feature)
-
-    return preprocessed_features
-
-
-def preprocess_feature(feature):
-    feature = feature.replace('_', ' ')
-    feature = remove_mentions_and_tags(feature)
-    # feature = remove_numbers(feature) TODO Check with Quim
-    feature = camel_case_to_words(feature)
-    feature = expand_contractions(feature)
-    feature = remove_special_characters(feature)
-    feature = remove_punctuation(feature)
-    feature = standarize_accents(feature)
-    # feature = spell_check(feature)
-    feature = lemmatize_spacy(feature)
-    # feature = lemmatize_stanza(feature)
-    feature = feature.lower()
-    return feature
-
-
-def expand_contractions(feature):
-    expanded_words = []
-    for word in feature.split():
-        expanded_words.append(contractions.fix(word))
-    return ' '.join(expanded_words)
-
-
-def standarize_accents(feature):
-    return unicodedata.normalize('NFKD', feature).encode('ascii', 'ignore').decode('utf-8', 'ignore')
-
-
-def remove_mentions_and_tags(text):
-    text = re.sub(r'@\S*', '', text)
-    return re.sub(r'#\S*', '', text)
-
-
-def remove_special_characters(text):
-    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
-    return re.sub(pat, '', text)
-
-
-def remove_numbers(text):
-    pattern = r'[^a-zA-z.,!?/:;\"\'\s]'
-    return re.sub(pattern, '', text)
-
-
-def remove_punctuation(text):
-    return ''.join([c for c in text if c not in string.punctuation])
-
-
-def camel_case_to_words(camel_case_str):
-    words = re.sub('([a-z])([A-Z])', r'\1 \2', camel_case_str)
-    return words
-
-
-def lemmatize_spacy(feature):
-    spacy.prefer_gpu()
-    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
-    doc = nlp(feature)
-    return " ".join([token.lemma_ for token in doc])
-
-
-def lemmatize_stanza(feature):
-    stanza.download('en')
-    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
-    doc = nlp(feature)
-    lemmatized_feature = ' '.join([word.lemma for sent in doc.sentences for word in sent.words])
-    return lemmatized_feature
-
+def call_preprocessing_service(features):
+    url = "http://localhost:3008/preprocess"
+    data = {
+        "features": features
+    }
-def spell_check(feature):
-    spell = SpellChecker()
-    corrected_feature = []
-    for word in feature.split():
-        corrected_word = spell.correction(word)
-        if corrected_word is not None:
-            corrected_feature.append(corrected_word)
+
+    try:
+        response = requests.post(url, json=data)
+        if response.status_code == 200:
+            return response.json()['preprocessed_features']
         else:
-            corrected_feature.append(word)
-    if corrected_feature is None:
-        return ""
-    return " ".join(corrected_feature)
+            raise Exception(f"Failed to preprocess features. Status code: {response.status_code}, Response: {response.text}")
+    except Exception as e:
+        raise Exception(f"Error occurred while calling preprocessing service: {str(e)}")
diff --git a/backend/preprocessing_service.py b/backend/preprocessing_service.py
new file mode 100644
index 00000000..c1ae836b
--- /dev/null
+++ b/backend/preprocessing_service.py
@@ -0,0 +1,118 @@
+# preprocessing_service.py
+
+from flask import Flask, request, jsonify
+import os
+import json
+from spellchecker import SpellChecker
+import stanza
+import unicodedata
+import contractions
+import re
+import string
+import spacy
+
+app = Flask(__name__)
+
+@app.route('/preprocess', methods=['POST'])
+def preprocess():
+    try:
+        request_data = request.get_json()
+        app_name = request_data['app_name']
+        features = request_data['features']
+
+        if 'preprocessing' in request_data and request_data['preprocessing']:
+            # Preprocess features
+            features = preprocess_features(features)
+            save_preprocessed_features(features, app_name)
+        else:
+            features = load_saved_preprocessed_features(app_name)
+
+        return jsonify({"preprocessed_features": features}), 200
+
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+def preprocessed_app(app_name):
+    file_path = f"static/preprocessed_jsons/{app_name}Features.json"
+    return os.path.exists(file_path) and os.path.getsize(file_path) > 0
+
+def save_preprocessed_features(features, app_name):
+    file_path = f"static/preprocessed_jsons/{app_name}Features.json"
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    with open(file_path, "w") as json_file:
+        json.dump(features, json_file)
+
+def load_saved_preprocessed_features(app_name):
+    file_path = f"static/preprocessed_jsons/{app_name}Features.json"
+    if os.path.exists(file_path):
+        with open(file_path, "r") as json_file:
+            return json.load(json_file)
+    return None
+
+def preprocess_features(features):
+    preprocessed_features = []
+    for feature in features:
+        if not is_emoji_only(feature) and not contains_weird_characters(feature):
+            preprocessed_feature = preprocess_feature(feature)
+            if is_english(preprocessed_feature):
+                preprocessed_features.append(preprocessed_feature)
+
+    return preprocessed_features
+
+def preprocess_feature(feature):
+    feature = feature.replace('_', ' ')
+    feature = remove_mentions_and_tags(feature)
+    feature = camel_case_to_words(feature)
+    feature = expand_contractions(feature)
+    feature = remove_special_characters(feature)
+    feature = remove_punctuation(feature)
+    feature = standarize_accents(feature)
+    feature = lemmatize_spacy(feature)
+    feature = feature.lower()
+    return feature
+
+# Utility functions (these are the same as from your original code)
+def expand_contractions(feature):
+    expanded_words = []
+    for word in feature.split():
+        expanded_words.append(contractions.fix(word))
+    return ' '.join(expanded_words)
+
+def standarize_accents(feature):
+    return unicodedata.normalize('NFKD', feature).encode('ascii', 'ignore').decode('utf-8', 'ignore')
+
+def remove_mentions_and_tags(text):
+    text = re.sub(r'@\S*', '', text)
+    return re.sub(r'#\S*', '', text)
+
+def remove_special_characters(text):
+    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]'
+    return re.sub(pat, '', text)
+
+def remove_punctuation(text):
+    return ''.join([c for c in text if c not in string.punctuation])
+
+def camel_case_to_words(camel_case_str):
+    words = re.sub('([a-z])([A-Z])', r'\1 \2', camel_case_str)
+    return words
+
+def lemmatize_spacy(feature):
+    spacy.prefer_gpu()
+    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
+    doc = nlp(feature)
+    return " ".join([token.lemma_ for token in doc])
+
+def is_english(text):
+    pattern = re.compile(r'^[a-zA-Z0-9\s.,?!\'"-]+$')
+    return bool(pattern.match(text))
+
+def is_emoji_only(text):
+    emoji_pattern = re.compile("[\U00010000-\U0010FFFF]+", flags=re.UNICODE)
+    return bool(emoji_pattern.fullmatch(text))
+
+def contains_weird_characters(text):
+    weird_characters_pattern = re.compile(r'[^a-zA-Z0-9\s.,?!\'"_-]')
+    return bool(weird_characters_pattern.search(text))
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=5000)

From 873808bfc0b328b033b6422d6cb6f27edcc07c62 Mon Sep 17 00:00:00 2001
From: mtiessler
Date: Mon, 30 Sep 2024 09:41:06 +0200
Subject: [PATCH 2/6] creating .env and preprocessing_service.py

---
 .env                            |  2 ++
 Pipfile                         | 19 ++++++++++---------
 backend/dendogram_controller.py |  1 -
 backend/dendogram_service.py    |  7 +++++--
 4 files changed, 17 insertions(+), 12 deletions(-)
 create mode 100644 .env

diff --git a/.env b/.env
new file mode 100644
index 00000000..9584a6ce
--- /dev/null
+++ b/.env
@@ -0,0 +1,2 @@
+DG_SERVICE_URL=http://localhost
+DG_SERVICE_PORT=3008
diff --git a/Pipfile b/Pipfile
index 9d405f6f..8f0bc7e6 100644
--- a/Pipfile
+++ b/Pipfile
@@ -5,14 +5,15 @@ name = "pypi"
 
 [packages]
 flask = ">=2.0,<3.0"
-stanza = "*"
-joblib = "*"
-scikit-learn = "*"
-matplotlib = "*"
-transformers = "*"
-contractions = "*"
-spacy = "*"
-pyspellchecker = "*"
+stanza = "1.3.0"
+joblib = "1.1.0"
+scikit-learn = "1.0.2"
+matplotlib = "3.5.1"
+transformers = "4.15.0"
+contractions = "0.1.72"
+spacy = "3.2.0"
+pyspellchecker = "0.6.2"
+python-dotenv = "0.19.2"
 
 [requires]
-python_version = "3.9"
\ No newline at end of file
+python_version = "3.9"
diff --git a/backend/dendogram_controller.py b/backend/dendogram_controller.py
index 80f96a3b..e53530e8 100644
--- a/backend/dendogram_controller.py
+++ b/backend/dendogram_controller.py
@@ -23,7 +23,6 @@ def generate_dendogram():
     if request_content['features'] is None:
         return make_response("No features", 400)
 
-    # TODO we should add the params in the request body, too many
     dendogram_file = dendogram_service.generate_dendogram(preprocessing,
                                                           affinity,
                                                           linkage,
diff --git a/backend/dendogram_service.py b/backend/dendogram_service.py
index f9adb356..150069b8 100644
--- a/backend/dendogram_service.py
+++ b/backend/dendogram_service.py
@@ -1,7 +1,11 @@
 import requests
 import json
 import os
+from .Context import Context
+from . import Affinity_strategy
+from dotenv import load_dotenv
+load_dotenv()
 
 def preprocessed_app(app_name):
     file_path = f"static/preprocessed_jsons/{app_name}Features.json"
     return os.path.exists(file_path) and os.path.getsize(file_path) > 0
@@ -31,13 +35,12 @@ def generate_dendogram(preprocessing,
     features = request_content['features']
 
     if preprocessing and not preprocessed_app(app_name):
-        # Call the external preprocessing service
         features = call_preprocessing_service(features)
         save_preprocessed_features(features, app_name)
     elif preprocessing and preprocessed_app(app_name):
         features = load_saved_preprocessed_features(app_name)
 
-    if embedding == 'bert-embedding-cosine' or embedding == 'all':
+    if embedding == 'bert-embedding-cosine':
         context = Context(Affinity_strategy.BERTCosineEmbeddingAffinity())
         return context.use_affinity_algorithm(application_name=app_name,
                                               data=features,
From 635a42e8710aa14572058270ef2b59b1e17d6d8f Mon Sep 17 00:00:00 2001
From: mtiessler
Date: Mon, 30 Sep 2024 10:36:06 +0200
Subject: [PATCH 3/6] better visualizator, trying to fix bugs in dendogram generation

---
 Pipfile                                       | 18 +++---
 backend/Affinity_strategy.py                  | 15 +++--
 backend/dendogram_service.py                  | 17 ++++--
 client/visualizator.py                        | 55 ++++++++++-------
 .../viber.voipFeatures.json                   |  1 +
 .../preprocessed_jsons/whatsappFeatures.json  |  1 +
 6 files changed, 66 insertions(+), 41 deletions(-)
 create mode 100644 static/preprocessed_jsons/viber.voipFeatures.json
 create mode 100644 static/preprocessed_jsons/whatsappFeatures.json

diff --git a/Pipfile b/Pipfile
index 8f0bc7e6..fd63bdd9 100644
--- a/Pipfile
+++ b/Pipfile
@@ -5,15 +5,15 @@ name = "pypi"
 
 [packages]
 flask = ">=2.0,<3.0"
-stanza = "1.3.0"
-joblib = "1.1.0"
-scikit-learn = "1.0.2"
-matplotlib = "3.5.1"
-transformers = "4.15.0"
-contractions = "0.1.72"
-spacy = "3.2.0"
-pyspellchecker = "0.6.2"
-python-dotenv = "0.19.2"
+stanza = "*"
+joblib = "*"
+scikit-learn = "*"
+matplotlib = "*"
+transformers = "*"
+contractions = "*"
+spacy = "*"
+pyspellchecker = "*"
+python-dotenv = "*"
 
 [requires]
 python_version = "3.9"
diff --git a/backend/Affinity_strategy.py b/backend/Affinity_strategy.py
index ac12d05e..af12f182 100644
--- a/backend/Affinity_strategy.py
+++ b/backend/Affinity_strategy.py
@@ -76,31 +76,30 @@ def compute_affinity(self,
         def process_batch(batch_data, batch_index):
             print(f"Processing batch {batch_index + 1}/{(len(data) + batch_size - 1) // batch_size}...")
 
-            # Tokenize and pad sentences in the batch
             tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in batch_data]
             max_len = max(len(sent) for sent in tokenized_sentences)
             padded_sentences = [sent + [tokenizer.pad_token_id] * (max_len - len(sent)) for sent in tokenized_sentences]
             input_ids = torch.tensor(padded_sentences)
 
-            # Get BERT embeddings
             print(f"Getting BERT embeddings for batch {batch_index + 1}...")
             with torch.no_grad():
                 outputs = model(input_ids)
                 embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token embeddings
 
-            # Apply verb and object weights
             print(f"Applying verb and object weights for batch {batch_index + 1}...")
             tagged_data = [nlp(sent) for sent in batch_data]
+
             for i, doc in enumerate(tagged_data):
                 for token in doc:
-                    if token.pos_ == 'VERB' and verb_weight != 0:
-                        embeddings[i] += verb_weight * embeddings[i]
-                    elif token.pos_ == 'NOUN' and object_weight != 0:
-                        embeddings[i] += object_weight * embeddings[i]
+                    token_position = token.pos_
+                    if token_position == 'VERB' and verb_weight != 0:
+                        embeddings[i] *= (1 + verb_weight)
+                    elif token_position == 'NOUN' and object_weight != 0:
+                        embeddings[i] *= (1 + object_weight)
 
             return embeddings
 
-        all_embeddings = []  # To store all batches of embeddings
+        all_embeddings = []
         batch_size = 32
 
         print(f"Processing data in batches of size {batch_size}...")
diff --git a/backend/dendogram_service.py b/backend/dendogram_service.py
index 150069b8..64e6a243 100644
--- a/backend/dendogram_service.py
+++ b/backend/dendogram_service.py
@@ -49,17 +49,26 @@ def generate_dendogram(preprocessing,
                                               verb_weight=verb_weight,
                                               distance_threshold=distance_threshold)
 
+
 def call_preprocessing_service(features):
-    url = "http://localhost:3008/preprocess"
+    url = os.getenv("DG_SERVICE_URL")
+    port = os.getenv("DG_SERVICE_PORT")
+
+    if not url or not port:
+        raise Exception("Preprocessing service URL or port not found in environment variables.")
+
+    full_url = f"{url}:{port}/preprocess"
+
     data = {
         "features": features
     }
 
     try:
-        response = requests.post(url, json=data)
+        response = requests.post(full_url, json=data)
         if response.status_code == 200:
             return response.json()['preprocessed_features']
         else:
-            raise Exception(f"Failed to preprocess features. Status code: {response.status_code}, Response: {response.text}")
+            raise Exception(
+                f"Failed to preprocess features. Status code: {response.status_code}, Response: {response.text}")
     except Exception as e:
-        raise Exception(f"Error occurred while calling preprocessing service: {str(e)}")
+        raise Exception(f"Error occurred while calling preprocessing service: {str(e)}")
\ No newline at end of file
diff --git a/client/visualizator.py b/client/visualizator.py
index 986ac05f..1e003368 100644
--- a/client/visualizator.py
+++ b/client/visualizator.py
@@ -3,8 +3,11 @@
 from scipy.cluster.hierarchy import dendrogram
 import joblib
 import argparse
+import os
 
 CLUSTER_COLOR_THRESHOLD = 0.08
+
+
 
 def add_line_breaks(labels):
     return [label.replace(' ', '\n') for label in labels]
@@ -25,18 +28,14 @@ def plot_dendrogram(model, labels, **kwargs):
         [model.children_, model.distances_, counts]
     ).astype(float)
     labels = add_line_breaks(labels=labels)
+
     dendrogram(linkage_matrix,
                labels=labels,
                color_threshold=CLUSTER_COLOR_THRESHOLD,
-               ** kwargs)
-    '''
-    for i, d, c in zip(linkage_matrix[:, 0], linkage_matrix[:, 1], linkage_matrix[:, 2]):
-        x = 0
-        y = c
-        plt.annotate('%.2f' % c, (x, y), xytext=(550, 0),
-                     textcoords='offset points', va='top', ha='center')
-    '''
-    plt.xticks(rotation=90, fontsize=10)
+               leaf_font_size=10,
+               **kwargs)
+
+    plt.xticks(rotation=90, fontsize=10, ha='right')
 
 
 def show_dendrogram(model_file):
     model = file['model']
     affinity = file['affinity']
     labels = file['labels']
+
     try:
         verb_weight = file['verb_weight']
     except KeyError:
         verb_weight = 'N/A'
-    try:
+
+    try:
         object_weight = file['object_weight']
     except KeyError:
         object_weight = 'N/A'
 
     if hasattr(model, 'children_'):
-        plt.figure(figsize=(15, 8))
+        n_leaves = len(labels)
+        figsize_width = max(100, n_leaves * 0.75)
+        plt.figure(figsize=(figsize_width, 12))
 
         plot_dendrogram(model, labels=labels)
+
         plt.title(application_name
                   + ' | ' + affinity
                   + ' | Distance Threshold: ' + str(distance_threshold)
                   + ' | Verb Weight: ' + str(verb_weight)
-                  + ' | Object weight: ' + str(object_weight))
-        plt.xlabel('Features', fontsize=12)
-        plt.ylabel('Distance', fontsize=12)
-        plt.xticks(rotation=90, fontsize=8)
+                  + ' | Object weight: ' + str(object_weight),
+                  fontsize=14)
+        plt.xlabel('Features', fontsize=14)
+        plt.ylabel('Distance', fontsize=14)
+        plt.subplots_adjust(bottom=0.2)
         plt.tight_layout()
-        plt.show()
+
+        save_directory = r"C:\Users\Max\NLP4RE\Dendogram-Generator\static\png"
+        base_name = os.path.splitext(os.path.basename(model_file))[0]
+        save_path = os.path.join(save_directory, f"{base_name}.png")
+        plt.savefig(save_path)
+        plt.close()
+        print(f"Dendrogram saved at: {save_path}")
     else:
         raise ValueError("The provided model is not AgglomerativeClustering.")
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Visualize dendrogram from a model file.")
-    parser.add_argument("model_file", help="Path to the model file")
-    args = parser.parse_args()
+    # Path to the directory containing .pkl files
+    pkls_directory = r"C:\Users\Max\NLP4RE\Dendogram-Generator\static\pkls"
 
-    show_dendrogram(args.model_file)
+    # Iterate over each .pkl file in the directory
+    for filename in os.listdir(pkls_directory):
+        if filename.endswith('.pkl'):
+            model_file = os.path.join(pkls_directory, filename)
+            print(f"Processing: {model_file}")
+            show_dendrogram(model_file)
\ No newline at end of file
diff --git a/static/preprocessed_jsons/viber.voipFeatures.json b/static/preprocessed_jsons/viber.voipFeatures.json
new file mode 100644
index 00000000..b03bfa37
--- /dev/null
+++ b/static/preprocessed_jsons/viber.voipFeatures.json
@@ -0,0 +1 @@
+["really poor performance", "nice satisfactory", "change rating", "drop call", "lifeline", "easy contact", "face problem", "ek baar bnd krte", "great o", "need more update", "dilip", "add option", "bad whatsapp", "grand app", "very low performance", "cf p", "make video call", "activate phone number", "very gud", "add folder feature", "viber", "restore datum", "have fold", "excellence"]
\ No newline at end of file
diff --git a/static/preprocessed_jsons/whatsappFeatures.json b/static/preprocessed_jsons/whatsappFeatures.json
new file mode 100644
index 00000000..d90db439
--- /dev/null
+++ b/static/preprocessed_jsons/whatsappFeatures.json
@@ -0,0 +1 @@
+["good app", "nice app", "absolute bias", "love app", "whatsapp", "bad app", "like app", "very nice app", "great app", "good", "love", "op", "use app", "very good app", "access whatsapp", "send code", "add feature", "app", "face issue", "love whatsapp", "new update", "update whatsapp", "download app", "good application", "good experience", "have problem", "open whatsapp", "remove channel", "send photo", "thank", "use whatsapp", "message app", "bad update", "ban account", "face problem", "freedom", "get verification code", "give star", "good work", "great experience", "hate new update", "open app", "supar", "very bad update", "very nice", "add option", "improve platform", "fantastic and amarze whatsapp", "very goad whatsapp", "see profile picture", "a", "star", "experience", "amazing app", "archive community group", "awesome", "be multi device support", "bring old status", "chat app", "enjoy app", "enjoy service", "excellence", "excellent service", "exellent", "fantastic", "free plastinee", "get code", "get notification", "good job", "good whatsapp", "good working", "great", "have app", "have issue", "have question", "improve quality", "install whatsapp", "late update attempt", "life", "like new update", "like update", "like way", "like whatsapp", "mast", "message", "mwaj hasani", "nandini", "need improvement", "nice aap", "nice application", "nice work", "post video", "receive call", "receive message", "receive verification code", "remove channel feature", "remove channel option", "see status", "send message", "send verification code", "send video", "share", "superb", "trash", "uninstall whatsapp", "update app", "useful app", "use", "veri", "very bad app", "very bad experience", "very good whatsapp", "very helpful great app", "very useful app", "whatapp", "wonderful", "work", "yas"]
\ No newline at end of file
From aeb3b00be4546e48e09786bf7e411d69d5c4ae51 Mon Sep 17 00:00:00 2001
From: mtiessler
Date: Mon, 30 Sep 2024 11:29:58 +0200
Subject: [PATCH 4/6] rollback

---
 backend/Affinity_strategy.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/Affinity_strategy.py b/backend/Affinity_strategy.py
index af12f182..ed144499 100644
--- a/backend/Affinity_strategy.py
+++ b/backend/Affinity_strategy.py
@@ -93,9 +93,9 @@ def process_batch(batch_data, batch_index):
             for token in doc:
                 token_position = token.pos_
                 if token_position == 'VERB' and verb_weight != 0:
-                    embeddings[i] *= (1 + verb_weight)
+                    embeddings[i] += verb_weight * embeddings[i]
                 elif token_position == 'NOUN' and object_weight != 0:
-                    embeddings[i] *= (1 + object_weight)
+                    embeddings[i] += object_weight * embeddings[i]
 
             return embeddings
 

From b01d1640dda9aef7679a5c4128ec9d8c6e459077 Mon Sep 17 00:00:00 2001
From: mtiessler
Date: Mon, 30 Sep 2024 15:11:06 +0200
Subject: [PATCH 5/6] added csv generation and png gen

---
 backend/Affinity_strategy.py | 19 ++++++++++++++++---
 client/visualizator.py       | 13 ++++++++-----
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/backend/Affinity_strategy.py b/backend/Affinity_strategy.py
index ed144499..20bcdca5 100644
--- a/backend/Affinity_strategy.py
+++ b/backend/Affinity_strategy.py
@@ -9,9 +9,10 @@
 import os
 import joblib
 import torch
+import pandas as pd
 
 MODEL_DIRECTORY_PATH = 'static' + os.path.sep + 'pkls'
-
+MODEL_DIRECTORY_CSV_PATH = 'static' + os.path.sep + 'csv'
 
 class AffinityStrategy():
     @abstractmethod
@@ -56,7 +57,6 @@ def compute_affinity(self, data: List):
     #     file_path = os.path.join(os.getcwd(), MODEL_DIRECTORY_PATH, file_name)
    #     joblib.dump(model_info, file_path)
    #     return file_path
-
 class BERTCosineEmbeddingAffinity(AffinityStrategy):
 
     def compute_affinity(self,
@@ -128,6 +128,19 @@ def process_batch(batch_data, batch_index):
 
         clustering_model.fit(dense_data_array)
 
+        # Get labels from clustering results
+        labels = clustering_model.labels_
+
+        # Create a DataFrame to save the clustering results
+        df_results = pd.DataFrame({'Sentence': data, 'Cluster': labels})
+
+        # Save the DataFrame to a CSV file
+        csv_file_name = f"{application_name}_bert_cosine_{linkage}_results.csv"
+        csv_file_path = os.path.join(os.getcwd(), MODEL_DIRECTORY_CSV_PATH, csv_file_name)
+
+        print(f"Saving clustering results to {csv_file_path}...")
+        df_results.to_csv(csv_file_path, index=False)
+
         # Save the clustering model and other information
         print("Saving the clustering model and metadata...")
         model_info = {
@@ -147,7 +160,7 @@ def process_batch(batch_data, batch_index):
         joblib.dump(model_info, file_path)
 
         print("Process completed.")
-        return file_path
+        return csv_file_path, file_path
 
 # class BERTEuclideanEmbeddingAffinity(AffinityStrategy):
 #     def compute_affinity(self, application_name, data: List, linkage, distance_threshold):
diff --git a/client/visualizator.py b/client/visualizator.py
index 1e003368..962d0787 100644
--- a/client/visualizator.py
+++ b/client/visualizator.py
@@ -58,8 +58,14 @@ def show_dendrogram(model_file):
 
     if hasattr(model, 'children_'):
         n_leaves = len(labels)
-        figsize_width = max(100, n_leaves * 0.75)
-        plt.figure(figsize=(figsize_width, 12))
+
+        max_figsize_width = 30
+        max_figsize_height = 15
+        figsize_width = min(max_figsize_width, n_leaves * 0.5)
+        figsize_height = max(12, min(max_figsize_height, n_leaves * 0.25))
+
+        plt.figure(figsize=(figsize_width, figsize_height))
+
         plot_dendrogram(model, labels=labels)
 
         plt.title(application_name
@@ -82,12 +88,9 @@ def show_dendrogram(model_file):
     else:
         raise ValueError("The provided model is not AgglomerativeClustering.")
 
-
 if __name__ == "__main__":
-    # Path to the directory containing .pkl files
     pkls_directory = r"C:\Users\Max\NLP4RE\Dendogram-Generator\static\pkls"
 
-    # Iterate over each .pkl file in the directory
     for filename in os.listdir(pkls_directory):
         if filename.endswith('.pkl'):
             model_file = os.path.join(pkls_directory, filename)
From 36736b881d7e2fce9c64e133adae4c9b4e54926b Mon Sep 17 00:00:00 2001
From: mtiessler
Date: Mon, 30 Sep 2024 15:11:17 +0200
Subject: [PATCH 6/6] added pandas

---
 Pipfile                                 | 1 +
 static/feature_preprocessings/Readme.md | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)
 delete mode 100644 static/feature_preprocessings/Readme.md

diff --git a/Pipfile b/Pipfile
index fd63bdd9..a6e4ecaa 100644
--- a/Pipfile
+++ b/Pipfile
@@ -14,6 +14,7 @@ contractions = "*"
 spacy = "*"
 pyspellchecker = "*"
 python-dotenv = "*"
+pandas = "*"
 
 [requires]
 python_version = "3.9"
diff --git a/static/feature_preprocessings/Readme.md b/static/feature_preprocessings/Readme.md
deleted file mode 100644
index 34f5454d..00000000
--- a/static/feature_preprocessings/Readme.md
+++ /dev/null
@@ -1 +0,0 @@
-Feature preprocessing files
\ No newline at end of file
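
Usage note (not part of the patches above): a minimal sketch of how the /preprocess endpoint introduced in PATCH 1 could be exercised once the Flask preprocessing service is running, assuming the DG_SERVICE_URL and DG_SERVICE_PORT variables from the .env added in PATCH 2 point at it. The app name "whatsapp" and the sample features are illustrative values only; the handler reads "app_name", "features" and "preprocessing" from the JSON body.

    import os

    import requests
    from dotenv import load_dotenv

    load_dotenv()

    # Same environment variables that dendogram_service.py reads after PATCH 3.
    base_url = f"{os.getenv('DG_SERVICE_URL')}:{os.getenv('DG_SERVICE_PORT')}"

    payload = {
        "app_name": "whatsapp",          # illustrative app name (hypothetical)
        "preprocessing": True,           # ask the service to run the NLP pipeline
        "features": ["good app", "send verification code"],
    }

    # POST the features and print the cleaned output returned by the service.
    response = requests.post(f"{base_url}/preprocess", json=payload)
    response.raise_for_status()
    print(response.json()["preprocessed_features"])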