
Commit 0a4d2c7

Refactor: Modularize codebase and add extensive new features (#29)
This commit represents a major overhaul of the Keywords4CV codebase, introducing significant improvements in modularity, functionality, robustness, and performance.

**Key Changes:**

* **Modularization:** The monolithic `keywords4cv.py` has been refactored into multiple well-defined modules, improving code organization, maintainability, and testability. New modules include: `config_validation`, `keyword_canonicalizer`, `bk_tree_enhancement`, `multiprocess_helpers`, `cache_manager`, `exceptions`, `validation_utils`, `metrics_reporter`, and `metrics_evaluation`.
* **New Features:**
  * **Keyword Canonicalization:** Deduplication and canonicalization of keywords using abbreviation expansion, n-gram overlap resolution, and embedding-based clustering.
  * **Enhanced BK-Tree:** Optimized fuzzy matching with adaptive caching.
  * **Comprehensive Metrics and Reporting:** Generation of detailed HTML reports with various metrics (precision, recall, F1, MAP, category coverage) and visualizations.
  * **Phrase-Level Synonyms:** Support for phrase-level synonyms from static files or via an API (with retries, caching, and a circuit breaker).
  * **Contextual Validation:** Configurable context window for improved semantic validation.
  * **Custom Sentence Splitting:** Allows custom rules to refine sentence segmentation.
  * **Dynamic Cache Sizing:** Cache sizes are adjusted dynamically based on available memory.
  * **Fuzzy Matching Order:** Configurable order of fuzzy matching and semantic validation.
* **Robustness and Bug Fixes:**
  * **Improved Error Handling:** Extensive use of custom exceptions for better error management.
  * **Enhanced Input Validation:** Thorough validation of input data and configuration using `pydantic`.
  * **Memory Management:** Improved memory usage through generators, explicit `del` statements, and dynamic cache sizing.
  * **Concurrency Improvements:** Proper initialization of spaCy models in worker processes.
* **Performance Optimizations:**
  * **Extensive Caching:** Caching of various operations (preprocessing, vectorization, fuzzy matching, API calls, validation).
  * **Generators:** Increased use of generators for memory efficiency.
  * **Optimized Data Structures:** Use of sets and `LRUCache`.
  * **Vectorized Operations:** Leveraging NumPy's and spaCy's vectorized operations.
  * **Parallel Processing:** Improved parallel-processing logic.
* **Deduplication:** Fixed duplicate entries in `config.yaml`.

This commit significantly enhances the functionality, reliability, and performance of Keywords4CV, making it a more powerful and user-friendly tool for keyword extraction.

Signed-off-by: David Osipov <personal@david-osipov.vision>
1 parent d303bc2 commit 0a4d2c7

14 files changed: +4287 −2028 lines
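Before the per-file diffs, here is a minimal, hedged sketch of how the new modules can compose. The config fragment is hypothetical, but every function and class used here appears in the diffs below:

```python
# Sketch only: combine dynamic cache sizing with the enhanced BK-Tree.
from cache_manager import calculate_optimal_cache_size
from bk_tree_enhancement import EnhancedBKTree

config = {
    "caching": {"cache_size": 5000},
    "hardware_limits": {"memory_scaling_factor": 0.3},
}

cache_size = calculate_optimal_cache_size(config)        # dynamic cache sizing
tree = EnhancedBKTree(["python", "pandas"], cache_size)  # cached fuzzy matching
print(tree.find("pytohn", threshold=2))                  # e.g. [(2, 'python')]
```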

_generate_ngrams.py (+32 lines)

```python
from typing import List, Set


def _generate_ngrams(self, tokens: List[str], n: int) -> Set[str]:
    """
    Generate n-grams from a list of tokens.

    Args:
        tokens: List of tokens to generate n-grams from
        n: Size of n-grams to generate

    Returns:
        Set[str]: Set of generated n-grams

    Raises:
        ValueError: If n is not a positive integer
    """
    if not isinstance(n, int) or n <= 0:
        raise ValueError(f"Invalid ngram size: {n}. Must be a positive integer")

    # Keep only multi-character tokens that are not stop words
    filtered_tokens = [
        token
        for token in tokens
        if len(token.strip()) > 1 and token not in self.preprocessor.stop_words
    ]

    if len(filtered_tokens) < n:
        return set()

    # Simplified: the len(token) > 1 filter above makes a second check here redundant
    ngrams = {
        " ".join(filtered_tokens[i : i + n])
        for i in range(len(filtered_tokens) - (n - 1))
    }
    return ngrams
```
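A minimal driver for the method above. `_generate_ngrams` is written as a method, so the host class and its `preprocessor.stop_words` attribute are stubbed here purely for illustration:

```python
# Hypothetical stubs standing in for the real preprocessor and host class.
class _StubPreprocessor:
    stop_words = {"the", "and", "for"}


class _Host:
    preprocessor = _StubPreprocessor()
    _generate_ngrams = _generate_ngrams  # bind the module-level function as a method


tokens = ["machine", "learning", "for", "natural", "language", "processing"]
print(_Host()._generate_ngrams(tokens, 2))
# {'machine learning', 'learning natural', 'natural language', 'language processing'}
```

Note that the stop word "for" is filtered out before n-grams are formed, so no bigram spans it.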

bk_tree_enhancement.py (+87 lines)

```python
"""
Enhanced BK-Tree implementation with optimized fuzzy matching and caching.
Can be imported by keywords4cv.py to replace the default BK-Tree implementation.
"""

import logging
from typing import List, Tuple, Set, Optional, Dict, Any

import pybktree
from Levenshtein import distance
from rapidfuzz import fuzz
from cachetools import LRUCache
import numpy as np

logger = logging.getLogger(__name__)


class EnhancedBKTree:
    """Enhanced BK-Tree with optimized fuzzy matching and caching support."""

    def __init__(self, items: List[str], cache_size: int = 1000):
        """Initialize the enhanced BK-Tree with the provided items."""
        self.bk_tree = pybktree.BKTree(distance, items) if items else None
        self.cache = LRUCache(maxsize=cache_size)
        self._query_count = 0
        self._hit_count = 0

    def find(
        self, query: str, threshold: int, limit: Optional[int] = None
    ) -> List[Tuple[int, str]]:
        """
        Find items within a certain Levenshtein distance threshold of the query string.

        Args:
            query: String to search for
            threshold: Maximum Levenshtein distance allowed
            limit: Maximum number of results to return (optional)

        Returns:
            List of (distance, item) pairs sorted by distance
        """
        if not self.bk_tree:
            return []

        self._query_count += 1
        cache_key = f"{query}_{threshold}_{limit}"

        # Check the cache first
        if cache_key in self.cache:
            self._hit_count += 1
            return self.cache[cache_key]

        # Not in cache, perform the actual query
        try:
            # Get all matches within the threshold
            matches = self.bk_tree.find(query, threshold)

            # Sort by distance (should already be sorted, but make sure)
            matches.sort(key=lambda x: x[0])

            # Apply the limit if specified
            if limit is not None and limit > 0:
                matches = matches[:limit]

            # Cache the results
            self.cache[cache_key] = matches
            return matches

        except Exception as e:
            logger.error(f"Error in BK-Tree search for '{query}': {e}")
            return []

    def get_hit_rate(self) -> float:
        """Return the cache hit rate as a percentage."""
        if self._query_count == 0:
            return 0.0
        return (self._hit_count / self._query_count) * 100

    def get_stats(self) -> Dict[str, Any]:
        """Return statistics about the tree and cache usage."""
        return {
            "tree_size": len(self.bk_tree.tree) if self.bk_tree else 0,
            "cache_size": len(self.cache),
            "cache_maxsize": self.cache.maxsize,
            "queries": self._query_count,
            "hits": self._hit_count,
            "hit_rate": self.get_hit_rate(),
        }
```
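A usage sketch for `EnhancedBKTree`, assuming `pybktree`, `python-Levenshtein`, `rapidfuzz`, `cachetools`, and `numpy` are installed; the skill list is made up:

```python
from bk_tree_enhancement import EnhancedBKTree

skills = ["python", "pytorch", "postgresql", "kubernetes"]
tree = EnhancedBKTree(skills, cache_size=100)

# First lookup walks the tree; an identical second lookup is served from the LRU cache.
print(tree.find("pythn", threshold=2, limit=5))  # e.g. [(1, 'python')]
print(tree.find("pythn", threshold=2, limit=5))  # cache hit
print(tree.get_stats())                          # includes queries, hits, hit_rate
```

Because the cache key encodes query, threshold, and limit, only exact repeats of the same lookup are served from cache.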

cache_manager.py (+92 lines)

```python
"""
Cache management utilities for Keywords4CV.
"""

import json
import os

import xxhash
import psutil
from typing import Dict, Any, Optional
from cachetools import LRUCache

# Constants
DEFAULT_CACHE_SIZE = 5000
DEFAULT_CACHE_SALT = "default_secret_salt"
CACHE_VERSION = "1.0.0"


def get_cache_salt(config: Dict[str, Any]) -> str:
    """
    Retrieve the cache salt, prioritizing environment variables, then config, then a default.

    Args:
        config: The configuration dictionary

    Returns:
        str: The cache salt to use for hashing operations
    """
    return os.environ.get(
        "K4CV_CACHE_SALT",
        config.get("caching", {}).get("cache_salt", DEFAULT_CACHE_SALT),
    )


def calculate_optimal_cache_size(config: Dict[str, Any]) -> int:
    """
    Calculate the optimal cache size based on available memory and configuration.

    Args:
        config: The configuration dictionary

    Returns:
        int: The calculated optimal cache size
    """
    base_cache_size = config.get("caching", {}).get("cache_size", DEFAULT_CACHE_SIZE)
    scaling_factor = config.get("hardware_limits", {}).get("memory_scaling_factor", 0.3)

    if scaling_factor:
        available_mb = psutil.virtual_memory().available / (1024 * 1024)
        dynamic_size = int(available_mb / scaling_factor)
        return min(base_cache_size, dynamic_size)

    return base_cache_size


class ConfigHasher:
    """
    Handles configuration hashing with intelligent cache invalidation.
    """

    @staticmethod
    def hash_config(
        config: Dict[str, Any], salt: str, sections: Optional[list] = None
    ) -> str:
        """
        Create a hash of relevant configuration sections.

        Args:
            config: Configuration dictionary
            salt: Salt value for the hash
            sections: Specific sections to include (if None, includes commonly cached sections)

        Returns:
            str: Hexadecimal hash of the configuration
        """
        if sections is None:
            sections = [
                "stop_words",
                "stop_words_add",
                "stop_words_exclude",
                "text_processing",
                "caching",
                "validation",
                "keyword_categories",
            ]

        relevant_config = {}
        for section in sections:
            if section in config:
                relevant_config[section] = config.get(section)

        config_str = json.dumps(relevant_config, sort_keys=True)
        return xxhash.xxh3_64(f"{salt}_{config_str}".encode("utf-8")).hexdigest()
```
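A hedged example of the `cache_manager` helpers; the config fragment is hypothetical but mirrors the keys the functions read:

```python
from cache_manager import get_cache_salt, calculate_optimal_cache_size, ConfigHasher

# Hypothetical config fragment for illustration only.
config = {
    "caching": {"cache_size": 5000, "cache_salt": "my_salt"},
    "hardware_limits": {"memory_scaling_factor": 0.3},
    "stop_words": ["and", "or"],
}

salt = get_cache_salt(config)                # K4CV_CACHE_SALT env var wins if set
size = calculate_optimal_cache_size(config)  # base size, capped by available memory
digest = ConfigHasher.hash_config(config, salt)
print(salt, size, digest)
```

Hashing only the cache-relevant sections means unrelated config edits do not invalidate cached results, while any change to the listed sections produces a new digest.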

cache_utils.py (+156 lines)

```python
"""Caching utilities for the Keywords4CV application."""

import time
import json
import logging
from typing import Any, Dict, Optional, Tuple, Union, List
from abc import ABC, abstractmethod

from cachetools import LRUCache

logger = logging.getLogger(__name__)


class CacheBackend(ABC):
    """Abstract base class for cache backends."""

    @abstractmethod
    def get(self, key: str) -> Any:
        """Get a value from the cache."""
        pass

    @abstractmethod
    def set(self, key: str, value: Any, ttl: Optional[int] = None) -> None:
        """Set a value in the cache with optional TTL."""
        pass

    @abstractmethod
    def delete(self, key: str) -> None:
        """Delete a value from the cache."""
        pass

    @abstractmethod
    def clear(self) -> None:
        """Clear the cache."""
        pass


class MemoryCacheBackend(CacheBackend):
    """In-memory cache backend using LRUCache."""

    def __init__(self, maxsize: int = 10000):
        """Initialize the memory cache backend."""
        self.cache = LRUCache(maxsize=maxsize)
        self.expiry = {}  # Separate dict for expiry times

    def get(self, key: str) -> Any:
        """
        Get a value from the cache, respecting TTL.

        Args:
            key: The cache key

        Returns:
            The cached value, or None if not found or expired
        """
        # Check whether the key exists and has not expired
        if key in self.expiry and self.expiry[key] is not None:
            if self.expiry[key] < time.time():
                # Expired: remove from cache
                self.delete(key)
                return None

        # Get from cache
        return self.cache.get(key)

    def set(self, key: str, value: Any, ttl: Optional[int] = None) -> None:
        """
        Set a value in the cache with optional TTL.

        Args:
            key: The cache key
            value: The value to cache
            ttl: Time-to-live in seconds, or None for no expiry
        """
        self.cache[key] = value

        # Set the expiry time if a TTL was provided
        if ttl is not None:
            self.expiry[key] = time.time() + ttl
        else:
            self.expiry[key] = None

    def delete(self, key: str) -> None:
        """
        Delete a value from the cache.

        Args:
            key: The cache key to delete
        """
        if key in self.cache:
            del self.cache[key]
        if key in self.expiry:
            del self.expiry[key]

    def clear(self) -> None:
        """Clear the cache."""
        self.cache.clear()
        self.expiry.clear()


class CacheManager:
    """Cache manager that can use different backend implementations."""

    def __init__(self, backend: Optional[CacheBackend] = None, namespace: str = "default"):
        """
        Initialize the cache manager.

        Args:
            backend: The cache backend to use
            namespace: Namespace for keys to avoid collisions
        """
        self.backend = backend or MemoryCacheBackend()
        self.namespace = namespace

    def _make_key(self, key: str) -> str:
        """Create a namespaced key."""
        return f"{self.namespace}:{key}"

    def get(self, key: str) -> Any:
        """Get a value from the cache."""
        return self.backend.get(self._make_key(key))

    def set(self, key: str, value: Any, ttl: Optional[int] = None) -> None:
        """Set a value in the cache with optional TTL."""
        self.backend.set(self._make_key(key), value, ttl)

    def delete(self, key: str) -> None:
        """Delete a value from the cache."""
        self.backend.delete(self._make_key(key))

    def clear(self) -> None:
        """Clear the cache."""
        self.backend.clear()

    def get_or_compute(
        self, key: str, compute_func, ttl: Optional[int] = None, *args, **kwargs
    ):
        """
        Get a value from cache or compute it if not found.

        Args:
            key: The cache key
            compute_func: Function to compute the value if not in cache
            ttl: Time-to-live in seconds
            *args: Arguments to pass to compute_func
            **kwargs: Keyword arguments to pass to compute_func

        Returns:
            The cached or computed value
        """
        cached = self.get(key)
        if cached is not None:
            return cached

        value = compute_func(*args, **kwargs)
        self.set(key, value, ttl)
        return value
```
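A quick usage sketch for `CacheManager` with the in-memory backend, exercising the TTL path; the key name and the `expensive` function are illustrative only:

```python
import time
from cache_utils import CacheManager, MemoryCacheBackend

cache = CacheManager(MemoryCacheBackend(maxsize=1000), namespace="vectors")


def expensive(x: int) -> int:
    print("computing...")
    return x * x


# First call computes and stores with a 2-second TTL; the second is a cache hit.
print(cache.get_or_compute("square:12", expensive, 2, 12))
print(cache.get_or_compute("square:12", expensive, 2, 12))

time.sleep(2.1)  # let the entry expire
print(cache.get_or_compute("square:12", expensive, 2, 12))  # recomputed
```

One caveat worth noting: because `get_or_compute` treats `None` as a miss, callers should avoid caching `None` as a legitimate value.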
