
Commit 0a4d2c7

Refactor: Modularize codebase and add extensive new features (#29)
This commit represents a major overhaul of the Keywords4CV codebase, introducing significant improvements in modularity, functionality, robustness, and performance.

**Key Changes:**

* **Modularization:** The monolithic `keywords4cv.py` has been refactored into multiple well-defined modules, improving code organization, maintainability, and testability. New modules include: `config_validation`, `keyword_canonicalizer`, `bk_tree_enhancement`, `multiprocess_helpers`, `cache_manager`, `exceptions`, `validation_utils`, `metrics_reporter`, and `metrics_evaluation`.
* **New Features:**
  * **Keyword Canonicalization:** Deduplication and canonicalization of keywords using abbreviation expansion, n-gram overlap resolution, and embedding-based clustering.
  * **Enhanced BK-Tree:** Optimized fuzzy matching with adaptive caching.
  * **Comprehensive Metrics and Reporting:** Generation of detailed HTML reports with various metrics (precision, recall, F1, MAP, category coverage) and visualizations.
  * **Phrase-Level Synonyms:** Support for phrase-level synonyms from static files or via an API (with retries, caching, and a circuit breaker).
  * **Contextual Validation:** Configurable context window for improved semantic validation.
  * **Custom Sentence Splitting:** Allows custom rules to refine sentence segmentation.
  * **Dynamic Cache Sizing:** Cache sizes are adjusted dynamically based on available memory.
  * **Fuzzy Matching Order:** Configurable order of fuzzy matching and semantic validation.
* **Robustness and Bug Fixes:**
  * **Improved Error Handling:** Extensive use of custom exceptions for better error management.
  * **Enhanced Input Validation:** Thorough validation of input data and configuration using `pydantic`.
  * **Memory Management:** Improved memory usage through generators, explicit `del` statements, and dynamic cache sizing.
  * **Concurrency Improvements:** Proper initialization of spaCy models in worker processes.
* **Performance Optimizations:**
  * **Extensive Caching:** Caching of various operations (preprocessing, vectorization, fuzzy matching, API calls, validation).
  * **Generators:** Increased use of generators for memory efficiency.
  * **Optimized Data Structures:** Use of sets and `LRUCache`.
  * **Vectorized Operations:** Leveraging NumPy's and spaCy's vectorized operations.
  * **Parallel Processing:** Improved parallel-processing logic.
* **Deduplication:** Fixed duplicate entries in `config.yaml`.

This commit significantly enhances the functionality, reliability, and performance of Keywords4CV, making it a more powerful and user-friendly tool for keyword extraction.

Signed-off-by: David Osipov <personal@david-osipov.vision>
1 parent d303bc2 commit 0a4d2c7

14 files changed: +4287 −2028 lines
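Before the per-file diffs, here is a minimal, hedged sketch of how the new modules can compose. The config fragment is hypothetical, but every function and class used here appears in the diffs below:

```python
# Sketch only: combine dynamic cache sizing with the enhanced BK-Tree.
from cache_manager import calculate_optimal_cache_size
from bk_tree_enhancement import EnhancedBKTree

config = {
    "caching": {"cache_size": 5000},
    "hardware_limits": {"memory_scaling_factor": 0.3},
}

cache_size = calculate_optimal_cache_size(config)        # dynamic cache sizing
tree = EnhancedBKTree(["python", "pandas"], cache_size)  # cached fuzzy matching
print(tree.find("pytohn", threshold=2))                  # e.g. [(2, 'python')]
```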

_generate_ngrams.py (+32 lines)

```python
from typing import List, Set


def _generate_ngrams(self, tokens: List[str], n: int) -> Set[str]:
    """
    Generate n-grams from a list of tokens.

    Args:
        tokens: List of tokens to generate n-grams from
        n: Size of n-grams to generate

    Returns:
        Set[str]: Set of generated n-grams

    Raises:
        ValueError: If n is not a positive integer
    """
    if not isinstance(n, int) or n <= 0:
        raise ValueError(f"Invalid ngram size: {n}. Must be a positive integer")

    # Keep only multi-character tokens that are not stop words
    filtered_tokens = [
        token
        for token in tokens
        if len(token.strip()) > 1 and token not in self.preprocessor.stop_words
    ]

    if len(filtered_tokens) < n:
        return set()

    # Simplified: the len(token) > 1 filter above makes a second check here redundant
    ngrams = {
        " ".join(filtered_tokens[i : i + n])
        for i in range(len(filtered_tokens) - (n - 1))
    }
    return ngrams
```
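A minimal driver for the method above. `_generate_ngrams` is written as a method, so the host class and its `preprocessor.stop_words` attribute are stubbed here purely for illustration:

```python
# Hypothetical stubs standing in for the real preprocessor and host class.
class _StubPreprocessor:
    stop_words = {"the", "and", "for"}


class _Host:
    preprocessor = _StubPreprocessor()
    _generate_ngrams = _generate_ngrams  # bind the module-level function as a method


tokens = ["machine", "learning", "for", "natural", "language", "processing"]
print(_Host()._generate_ngrams(tokens, 2))
# {'machine learning', 'learning natural', 'natural language', 'language processing'}
```

Note that the stop word "for" is filtered out before n-grams are formed, so no bigram spans it.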

bk_tree_enhancement.py (+87 lines)

```python
"""
Enhanced BK-Tree implementation with optimized fuzzy matching and caching.
Can be imported by keywords4cv.py to replace the default BK-Tree implementation.
"""

import logging
from typing import List, Tuple, Set, Optional, Dict, Any

import pybktree
from Levenshtein import distance
from rapidfuzz import fuzz
from cachetools import LRUCache
import numpy as np

logger = logging.getLogger(__name__)


class EnhancedBKTree:
    """Enhanced BK-Tree with optimized fuzzy matching and caching support."""

    def __init__(self, items: List[str], cache_size: int = 1000):
        """Initialize the enhanced BK-Tree with the provided items."""
        self.bk_tree = pybktree.BKTree(distance, items) if items else None
        self.cache = LRUCache(maxsize=cache_size)
        self._query_count = 0
        self._hit_count = 0

    def find(
        self, query: str, threshold: int, limit: Optional[int] = None
    ) -> List[Tuple[int, str]]:
        """
        Find items within a certain Levenshtein distance threshold of the query string.

        Args:
            query: String to search for
            threshold: Maximum Levenshtein distance allowed
            limit: Maximum number of results to return (optional)

        Returns:
            List of (distance, item) pairs sorted by distance
        """
        if not self.bk_tree:
            return []

        self._query_count += 1
        cache_key = f"{query}_{threshold}_{limit}"

        # Check the cache first
        if cache_key in self.cache:
            self._hit_count += 1
            return self.cache[cache_key]

        # Not in cache, perform the actual query
        try:
            # Get all matches within the threshold
            matches = self.bk_tree.find(query, threshold)

            # Sort by distance (should already be sorted, but make sure)
            matches.sort(key=lambda x: x[0])

            # Apply the limit if specified
            if limit is not None and limit > 0:
                matches = matches[:limit]

            # Cache the results
            self.cache[cache_key] = matches
            return matches

        except Exception as e:
            logger.error(f"Error in BK-Tree search for '{query}': {e}")
            return []

    def get_hit_rate(self) -> float:
        """Return the cache hit rate as a percentage."""
        if self._query_count == 0:
            return 0.0
        return (self._hit_count / self._query_count) * 100

    def get_stats(self) -> Dict[str, Any]:
        """Return statistics about the tree and cache usage."""
        return {
            "tree_size": len(self.bk_tree.tree) if self.bk_tree else 0,
            "cache_size": len(self.cache),
            "cache_maxsize": self.cache.maxsize,
            "queries": self._query_count,
            "hits": self._hit_count,
            "hit_rate": self.get_hit_rate(),
        }
```
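A usage sketch for `EnhancedBKTree`, assuming `pybktree`, `python-Levenshtein`, `rapidfuzz`, `cachetools`, and `numpy` are installed; the skill list is made up:

```python
from bk_tree_enhancement import EnhancedBKTree

skills = ["python", "pytorch", "postgresql", "kubernetes"]
tree = EnhancedBKTree(skills, cache_size=100)

# First lookup walks the tree; an identical second lookup is served from the LRU cache.
print(tree.find("pythn", threshold=2, limit=5))  # e.g. [(1, 'python')]
print(tree.find("pythn", threshold=2, limit=5))  # cache hit
print(tree.get_stats())                          # includes queries, hits, hit_rate
```

Because the cache key encodes query, threshold, and limit, only exact repeats of the same lookup are served from cache.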

cache_manager.py (+92 lines)

```python
"""
Cache management utilities for Keywords4CV.
"""

import json
import os

import xxhash
import psutil
from typing import Dict, Any, Optional
from cachetools import LRUCache

# Constants
DEFAULT_CACHE_SIZE = 5000
DEFAULT_CACHE_SALT = "default_secret_salt"
CACHE_VERSION = "1.0.0"


def get_cache_salt(config: Dict[str, Any]) -> str:
    """
    Retrieve the cache salt, prioritizing environment variables, then config, then a default.

    Args:
        config: The configuration dictionary

    Returns:
        str: The cache salt to use for hashing operations
    """
    return os.environ.get(
        "K4CV_CACHE_SALT",
        config.get("caching", {}).get("cache_salt", DEFAULT_CACHE_SALT),
    )


def calculate_optimal_cache_size(config: Dict[str, Any]) -> int:
    """
    Calculate the optimal cache size based on available memory and configuration.

    Args:
        config: The configuration dictionary

    Returns:
        int: The calculated optimal cache size
    """
    base_cache_size = config.get("caching", {}).get("cache_size", DEFAULT_CACHE_SIZE)
    scaling_factor = config.get("hardware_limits", {}).get("memory_scaling_factor", 0.3)

    if scaling_factor:
        available_mb = psutil.virtual_memory().available / (1024 * 1024)
        dynamic_size = int(available_mb / scaling_factor)
        return min(base_cache_size, dynamic_size)

    return base_cache_size


class ConfigHasher:
    """
    Handles configuration hashing with intelligent cache invalidation.
    """

    @staticmethod
    def hash_config(
        config: Dict[str, Any], salt: str, sections: Optional[list] = None
    ) -> str:
        """
        Create a hash of relevant configuration sections.

        Args:
            config: Configuration dictionary
            salt: Salt value for the hash
            sections: Specific sections to include (if None, includes commonly cached sections)

        Returns:
            str: Hexadecimal hash of the configuration
        """
        if sections is None:
            sections = [
                "stop_words",
                "stop_words_add",
                "stop_words_exclude",
                "text_processing",
                "caching",
                "validation",
                "keyword_categories",
            ]

        relevant_config = {}
        for section in sections:
            if section in config:
                relevant_config[section] = config.get(section)

        config_str = json.dumps(relevant_config, sort_keys=True)
        return xxhash.xxh3_64(f"{salt}_{config_str}".encode("utf-8")).hexdigest()
```
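A hedged example of the `cache_manager` helpers; the config fragment is hypothetical but mirrors the keys the functions read:

```python
from cache_manager import get_cache_salt, calculate_optimal_cache_size, ConfigHasher

# Hypothetical config fragment for illustration only.
config = {
    "caching": {"cache_size": 5000, "cache_salt": "my_salt"},
    "hardware_limits": {"memory_scaling_factor": 0.3},
    "stop_words": ["and", "or"],
}

salt = get_cache_salt(config)                # K4CV_CACHE_SALT env var wins if set
size = calculate_optimal_cache_size(config)  # base size, capped by available memory
digest = ConfigHasher.hash_config(config, salt)
print(salt, size, digest)
```

Hashing only the cache-relevant sections means unrelated config edits do not invalidate cached results, while any change to the listed sections produces a new digest.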

cache_utils.py (+156 lines)

```python
"""Caching utilities for the Keywords4CV application."""

import time
import json
import logging
from typing import Any, Dict, Optional, Tuple, Union, List
from abc import ABC, abstractmethod

from cachetools import LRUCache

logger = logging.getLogger(__name__)


class CacheBackend(ABC):
    """Abstract base class for cache backends."""

    @abstractmethod
    def get(self, key: str) -> Any:
        """Get a value from the cache."""
        pass

    @abstractmethod
    def set(self, key: str, value: Any, ttl: Optional[int] = None) -> None:
        """Set a value in the cache with optional TTL."""
        pass

    @abstractmethod
    def delete(self, key: str) -> None:
        """Delete a value from the cache."""
        pass

    @abstractmethod
    def clear(self) -> None:
        """Clear the cache."""
        pass


class MemoryCacheBackend(CacheBackend):
    """In-memory cache backend using LRUCache."""

    def __init__(self, maxsize: int = 10000):
        """Initialize the memory cache backend."""
        self.cache = LRUCache(maxsize=maxsize)
        self.expiry = {}  # Separate dict for expiry times

    def get(self, key: str) -> Any:
        """
        Get a value from the cache, respecting TTL.

        Args:
            key: The cache key

        Returns:
            The cached value, or None if not found or expired
        """
        # Check whether the key exists and has not expired
        if key in self.expiry and self.expiry[key] is not None:
            if self.expiry[key] < time.time():
                # Expired: remove from cache
                self.delete(key)
                return None

        # Get from cache
        return self.cache.get(key)

    def set(self, key: str, value: Any, ttl: Optional[int] = None) -> None:
        """
        Set a value in the cache with optional TTL.

        Args:
            key: The cache key
            value: The value to cache
            ttl: Time-to-live in seconds, or None for no expiry
        """
        self.cache[key] = value

        # Set the expiry time if a TTL was provided
        if ttl is not None:
            self.expiry[key] = time.time() + ttl
        else:
            self.expiry[key] = None

    def delete(self, key: str) -> None:
        """
        Delete a value from the cache.

        Args:
            key: The cache key to delete
        """
        if key in self.cache:
            del self.cache[key]
        if key in self.expiry:
            del self.expiry[key]

    def clear(self) -> None:
        """Clear the cache."""
        self.cache.clear()
        self.expiry.clear()


class CacheManager:
    """Cache manager that can use different backend implementations."""

    def __init__(self, backend: Optional[CacheBackend] = None, namespace: str = "default"):
        """
        Initialize the cache manager.

        Args:
            backend: The cache backend to use
            namespace: Namespace for keys to avoid collisions
        """
        self.backend = backend or MemoryCacheBackend()
        self.namespace = namespace

    def _make_key(self, key: str) -> str:
        """Create a namespaced key."""
        return f"{self.namespace}:{key}"

    def get(self, key: str) -> Any:
        """Get a value from the cache."""
        return self.backend.get(self._make_key(key))

    def set(self, key: str, value: Any, ttl: Optional[int] = None) -> None:
        """Set a value in the cache with optional TTL."""
        self.backend.set(self._make_key(key), value, ttl)

    def delete(self, key: str) -> None:
        """Delete a value from the cache."""
        self.backend.delete(self._make_key(key))

    def clear(self) -> None:
        """Clear the cache."""
        self.backend.clear()

    def get_or_compute(
        self, key: str, compute_func, ttl: Optional[int] = None, *args, **kwargs
    ):
        """
        Get a value from cache or compute it if not found.

        Args:
            key: The cache key
            compute_func: Function to compute the value if not in cache
            ttl: Time-to-live in seconds
            *args: Arguments to pass to compute_func
            **kwargs: Keyword arguments to pass to compute_func

        Returns:
            The cached or computed value
        """
        cached = self.get(key)
        if cached is not None:
            return cached

        value = compute_func(*args, **kwargs)
        self.set(key, value, ttl)
        return value
```
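A quick usage sketch for `CacheManager` with the in-memory backend, exercising the TTL path; the key name and the `expensive` function are illustrative only:

```python
import time
from cache_utils import CacheManager, MemoryCacheBackend

cache = CacheManager(MemoryCacheBackend(maxsize=1000), namespace="vectors")


def expensive(x: int) -> int:
    print("computing...")
    return x * x


# First call computes and stores with a 2-second TTL; the second is a cache hit.
print(cache.get_or_compute("square:12", expensive, 2, 12))
print(cache.get_or_compute("square:12", expensive, 2, 12))

time.sleep(2.1)  # let the entry expire
print(cache.get_or_compute("square:12", expensive, 2, 12))  # recomputed
```

One caveat worth noting: because `get_or_compute` treats `None` as a miss, callers should avoid caching `None` as a legitimate value.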
