-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetrics_evaluation.py
242 lines (193 loc) · 7.52 KB
/
metrics_evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
"""
Metrics Evaluation Module for Keywords4CV
This module provides additional evaluation metrics and utilities for assessing
keyword extraction performance.
"""
import numpy as np
import pandas as pd
from typing import Dict, Set, List, Tuple
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_recall_curve, average_precision_score
class KeywordMetricsEvaluator:
"""
Evaluates keyword extraction performance with multiple metrics.
This class provides methods to calculate and visualize various metrics
for keyword extraction, including precision, recall, F1-score, and
more advanced metrics like Mean Average Precision.
"""
def __init__(self, original_skills: Set[str], expanded_skills: Set[str]):
"""
Initialize with reference skill sets.
Args:
original_skills: Set of original skills from the configuration
expanded_skills: Set of expanded skills (including synonyms and variants)
"""
self.original_skills = set(s.lower() for s in original_skills)
self.expanded_skills = set(s.lower() for s in expanded_skills)
def calculate_basic_metrics(self, extracted_keywords: Set[str]) -> Dict:
"""
Calculate basic keyword extraction metrics.
Args:
extracted_keywords: Set of keywords extracted by the system
Returns:
Dictionary containing precision, recall, and F1 metrics
"""
# Convert to lowercase for case-insensitive comparison
extracted = set(k.lower() for k in extracted_keywords)
# Metrics against original skills
original_recall = (
len(extracted & self.original_skills) / len(self.original_skills)
if self.original_skills
else 0
)
# Metrics against expanded skills
expanded_recall = (
len(extracted & self.expanded_skills) / len(self.expanded_skills)
if self.expanded_skills
else 0
)
precision = (
len(extracted & self.expanded_skills) / len(extracted) if extracted else 0
)
# Calculate F1 scores
original_f1 = (
2 * precision * original_recall / (precision + original_recall)
if precision + original_recall > 0
else 0
)
expanded_f1 = (
2 * precision * expanded_recall / (precision + expanded_recall)
if precision + expanded_recall > 0
else 0
)
# Add coverage metric: how many categories are represented in the extracted keywords
coverage = self._calculate_category_coverage(extracted)
return {
"original_recall": original_recall,
"expanded_recall": expanded_recall,
"precision": precision,
"original_f1": original_f1,
"expanded_f1": expanded_f1,
"category_coverage": coverage,
}
def _calculate_category_coverage(self, extracted_keywords: Set[str]) -> float:
"""
Calculate what percentage of skill categories are represented in extracted keywords.
This is a placeholder implementation that should be customized based on your
specific category structure.
Args:
extracted_keywords: Set of extracted keywords
Returns:
Coverage percentage (0.0 to 1.0)
"""
# Implement based on your category structure
# This is just a placeholder - implement according to your needs
return 0.0
def visualize_metrics(self, metrics_history: List[Dict], output_file: str = None):
"""
Visualize metrics over time or across different configurations.
Args:
metrics_history: List of metric dictionaries from different runs
output_file: Optional path to save the visualization
"""
if not metrics_history:
return
# Convert metrics history to DataFrame
df = pd.DataFrame(metrics_history)
# Create visualization
plt.figure(figsize=(10, 6))
# Plot metrics
for metric in [
"precision",
"original_recall",
"expanded_recall",
"expanded_f1",
]:
if metric in df.columns:
plt.plot(df.index, df[metric], label=metric)
plt.title("Keyword Extraction Performance Metrics")
plt.xlabel("Run")
plt.ylabel("Score")
plt.legend()
plt.grid(True, alpha=0.3)
if output_file:
plt.savefig(output_file, dpi=300, bbox_inches="tight")
else:
plt.show()
def calculate_advanced_metrics(
self, extracted_keywords: List[Tuple[str, float]]
) -> Dict:
"""
Calculate advanced metrics for ranked keyword extraction.
Args:
extracted_keywords: List of tuples with (keyword, confidence_score)
Returns:
Dictionary of advanced metrics
"""
# Sort by confidence score
extracted_keywords = sorted(
extracted_keywords, key=lambda x: x[1], reverse=True
)
# Calculate Mean Average Precision
y_true = [
1 if k[0].lower() in self.expanded_skills else 0 for k in extracted_keywords
]
y_scores = [k[1] for k in extracted_keywords]
if sum(y_true) == 0: # No relevant keywords found
return {"mean_avg_precision": 0.0}
try:
mean_avg_precision = average_precision_score(y_true, y_scores)
except:
mean_avg_precision = 0.0
return {"mean_avg_precision": mean_avg_precision}
def compare_metrics_across_runs(run_metrics: Dict[str, Dict]) -> pd.DataFrame:
"""
Compare metrics across different runs or configurations.
Args:
run_metrics: Dictionary mapping run_id to metrics dictionary
Returns:
DataFrame with comparative metrics
"""
# Create DataFrame with metrics from all runs
data = []
for run_id, metrics in run_metrics.items():
metrics_copy = metrics.copy()
metrics_copy["run_id"] = run_id
data.append(metrics_copy)
return pd.DataFrame(data)
def plot_metrics_comparison(metrics_df: pd.DataFrame, output_file: str = None):
"""
Create a comparison plot of metrics across runs.
Args:
metrics_df: DataFrame with metrics from compare_metrics_across_runs
output_file: Optional path to save the plot
"""
# Select numeric columns for plotting
numeric_cols = metrics_df.select_dtypes(
include=["float64", "int64"]
).columns.tolist()
numeric_cols = [col for col in numeric_cols if col != "run_id"]
if not numeric_cols:
return
# Melt the DataFrame for easier plotting
melted_df = pd.melt(
metrics_df,
id_vars=["run_id"],
value_vars=numeric_cols,
var_name="Metric",
value_name="Value",
)
# Create plot
plt.figure(figsize=(12, 8))
sns.barplot(x="run_id", y="Value", hue="Metric", data=melted_df)
plt.title("Metrics Comparison Across Runs")
plt.xlabel("Run")
plt.ylabel("Score")
plt.xticks(rotation=45)
plt.legend(title="Metric")
plt.tight_layout()
if output_file:
plt.savefig(output_file, dpi=300, bbox_inches="tight")
else:
plt.show()