Skip to content

Commit

Permalink
version ok, subplots more less
Browse files Browse the repository at this point in the history
  • Loading branch information
mtiessler committed Nov 27, 2024
1 parent 4d8cd97 commit 119e189
Showing 1 changed file with 83 additions and 41 deletions.
124 changes: 83 additions & 41 deletions client/visualizator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch


model_name = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(
Expand Down Expand Up @@ -48,6 +47,68 @@ def generate_dynamic_label(cluster_labels):
return label.split('\n')[0]


def build_hierarchical_json(linkage_matrix, labels):
    """
    Build a nested, JSON-serializable tree from a linkage matrix.

    Leaves become ``{"id", "label"}`` dicts; merges become
    ``{"id", "distance", "children"}`` dicts whose ``children`` list holds the
    left and right sub-trees, in that order.

    The tree is assembled with an explicit stack instead of recursion, so a
    very unbalanced (chain-like) hierarchy with thousands of leaves does not
    hit Python's recursion limit while it is being built.
    NOTE(review): serializing an extremely deep result with ``json`` may still
    be limited by the encoder's own nesting depth - confirm for large inputs.

    Args:
        linkage_matrix: Row-indexable merge table (e.g. a 2-D ndarray). Row
            ``i`` describes node ``len(labels) + i``: columns 0 and 1 hold the
            ids of the merged children and column 2 the merge distance.
        labels: Sequence of leaf labels, indexed by leaf id.

    Returns:
        dict: The root node of the hierarchy.
    """
    n_samples = len(labels)

    def make_node(node_id):
        """Return (node dict, child ids) - child ids is None for a leaf."""
        if node_id < n_samples:
            return {"id": int(node_id), "label": labels[node_id]}, None
        row = linkage_matrix[node_id - n_samples]
        node = {"id": int(node_id), "distance": float(row[2]), "children": []}
        return node, (int(row[0]), int(row[1]))

    # The last merge recorded in the matrix is the root of the tree.
    root, root_children = make_node(len(linkage_matrix) + n_samples - 1)

    pending = []
    if root_children is not None:
        pending.append((root, root_children))

    while pending:
        parent, child_ids = pending.pop()
        # Append left then right so the children order matches the matrix.
        for child_id in child_ids:
            child, grandchild_ids = make_node(child_id)
            parent["children"].append(child)
            if grandchild_ids is not None:
                pending.append((child, grandchild_ids))

    return root


def build_hierarchical_json_for_subcluster(linkage_matrix, cluster_labels):
    """
    Build a hierarchical JSON structure for a sub-cluster.

    The traversal is the same as for the general dendrogram, so this simply
    delegates to ``build_hierarchical_json`` instead of keeping a second copy
    of the same logic.

    Args:
        linkage_matrix (ndarray): Linkage matrix for the sub-cluster.
        cluster_labels (list): List of labels for the sub-cluster's leaf nodes.

    Returns:
        dict: Hierarchical JSON structure.
    """
    return build_hierarchical_json(linkage_matrix, cluster_labels)


def save_json(data, file_path):
    """
    Write ``data`` to ``file_path`` as JSON indented by four spaces.

    Args:
        data: Any object the ``json`` module can serialize.
        file_path (str): Destination path; an existing file is overwritten.

    Raises:
        TypeError, ValueError: If ``data`` cannot be serialized. In that case
            the destination file is not opened, so previous content survives.
    """
    # Serialize before opening: mode 'w' truncates the file immediately, so a
    # failure in the middle of dumping would leave an empty or partial file.
    serialized = json.dumps(data, indent=4)
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json_file.write(serialized)
    print(f"JSON saved at: {file_path}")


def render_dendrogram_and_process_clusters(model_info, model, labels, color_threshold, distance_threshold):
application_name = model_info['application_name']
affinity = model_info['affinity']
Expand All @@ -61,8 +122,10 @@ def render_dendrogram_and_process_clusters(model_info, model, labels, color_thre
reset_folder(app_folder)
os.makedirs(app_folder, exist_ok=True)

# Linkage matrix for the hierarchical clustering
linkage_matrix = np.column_stack([model.children_, model.distances_, np.zeros(len(model.children_))]).astype(float)

# Generate and save the general dendrogram
fig, ax = plt.subplots(figsize=(30, 30))
dendrogram_result = dendrogram(
linkage_matrix,
Expand All @@ -79,7 +142,13 @@ def render_dendrogram_and_process_clusters(model_info, model, labels, color_thre
f"| Verb Weight: {verb_weight} | Object Weight: {object_weight}",
fontsize=14
)
# Extract clusters based on colored labels, ignoring grey clusters
plt.tight_layout()
final_dendrogram_path = os.path.join(app_folder, f"{application_name}_final_dendrogram.png")
plt.savefig(final_dendrogram_path)
plt.close(fig)
print(f"Final dendrogram saved at: {final_dendrogram_path}")

# Extract clusters based on colors
cluster_map = {}
for leaf, color in zip(dendrogram_result['leaves'], dendrogram_result['leaves_color_list']):
if color == 'grey':
Expand All @@ -88,18 +157,17 @@ def render_dendrogram_and_process_clusters(model_info, model, labels, color_thre
cluster_map[color] = []
cluster_map[color].append(labels[leaf])

plt.tight_layout()
final_dendrogram_path = os.path.join(app_folder, f"{application_name}_final_dendrogram.png")
plt.savefig(final_dendrogram_path)
plt.close(fig)
print(f"Final dendrogram saved at: {final_dendrogram_path}")
# Save hierarchical JSON for the general dendrogram
general_json = build_hierarchical_json(linkage_matrix, labels)
general_json_path = os.path.join(app_folder, f"{application_name}_general_hierarchy.json")
save_json(general_json, general_json_path)

# Process and save the individual clusters
process_and_save_clusters(cluster_map, application_name, app_folder)

return cluster_map



def process_and_save_clusters(cluster_map, application_name, app_folder):
final_csv_data = []
for cluster_id, (color, cluster_labels) in enumerate(cluster_map.items(), start=1):
Expand All @@ -118,14 +186,12 @@ def process_and_save_clusters(cluster_map, application_name, app_folder):
cluster_df.to_csv(cluster_csv_path, index=False, sep=',')
print(f"Cluster {cluster_id} saved to {cluster_csv_path}")

# Save cluster details to a JSON
cluster_json_path = os.path.join(cluster_folder, f"{cluster_label}.json")
with open(cluster_json_path, 'w') as json_file:
json.dump({"Cluster Name": dynamic_label, "Feature List": cluster_labels}, json_file, indent=4)
print(f"Cluster {cluster_id} JSON saved at {cluster_json_path}")

# Generate and save an individual dendrogram
generate_individual_dendrogram(cluster_labels, cluster_id, application_name, cluster_label, cluster_folder)
# Generate and save hierarchical JSON for the sub-cluster
filtered_data = np.random.rand(len(cluster_labels), 2) # Generate dummy data for sub-cluster
sub_linkage_matrix = linkage(filtered_data, method='ward')
sub_json = build_hierarchical_json_for_subcluster(sub_linkage_matrix, cluster_labels)
sub_json_path = os.path.join(cluster_folder, f"{cluster_label}_hierarchy.json")
save_json(sub_json, sub_json_path)

# Append to final CSV summary data
final_csv_data.append({
Expand All @@ -141,33 +207,9 @@ def process_and_save_clusters(cluster_map, application_name, app_folder):
print(f"Final summary CSV saved at: {final_csv_path}")


def generate_individual_dendrogram(cluster_labels, cluster_id, application_name, cluster_label, output_folder):
    """
    Render a right-oriented dendrogram for one cluster and save it as a PNG.

    The tree is computed with Ward linkage over random 2-D placeholder points
    (one per label), so its shape does not reflect real feature distances.
    Clusters with fewer than two labels are reported and skipped.

    Args:
        cluster_labels (list): Labels shown on the dendrogram leaves.
        cluster_id: Identifier used in the title and log messages.
        application_name (str): Application name shown in the title.
        cluster_label (str): Cluster name; also the output file name prefix.
        output_folder (str): Directory the PNG is written to.
    """
    label_count = len(cluster_labels)
    if label_count < 2:
        print(f"Cluster {cluster_id} has less than 2 labels, skipping dendrogram generation.")
        return

    # Placeholder coordinates: one random 2-D point per label.
    placeholder_points = np.random.rand(label_count, 2)
    ward_tree = linkage(placeholder_points, method='ward')

    figure, axes = plt.subplots(figsize=(10, 6))
    dendrogram(
        ward_tree,
        labels=cluster_labels,
        leaf_font_size=10,
        orientation='right',
        ax=axes,
    )

    axes.set_title(f"{application_name} | Cluster {cluster_id} | {cluster_label}")
    axes.set_xlabel("Distance")
    axes.set_ylabel("Data Points")

    image_path = os.path.join(output_folder, f"{cluster_label}_dendrogram.png")
    plt.tight_layout()
    plt.savefig(image_path)
    plt.close()
    print(f"Dendrogram for Cluster {cluster_id} saved at: {image_path}")


def generate_dendrogram_visualization(model_file):
model_info = joblib.load(model_file)
distance_threshold = 0.2
distance_threshold = model_info['distance_threshold']
clustering_model = model_info['model']
labels = model_info['labels']

Expand Down

0 comments on commit 119e189

Please sign in to comment.