 from collections import OrderedDict # For creating an LRU cache
 from typing import Dict, List, Set, Tuple, NamedTuple, Optional # For type hinting
 from multiprocessing import (
-    Pool,
     TimeoutError as MPTimeoutError,
 ) # For parallel processing
 import concurrent.futures # For multithreading category vector calculation
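The removed `Pool` import leaves only `TimeoutError` (as `MPTimeoutError`) from `multiprocessing`, while `concurrent.futures` carries the threaded category-vector work the comment mentions. A minimal sketch of that thread-pool-with-timeout pattern; `compute_fn` and `categories` are hypothetical names, not part of the repository:

import concurrent.futures

def compute_vectors_in_threads(categories, compute_fn, timeout=30):
    # Run compute_fn over categories in worker threads; skip slow or failing ones.
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(compute_fn, cat): cat for cat in categories}
        for future, cat in futures.items():
            try:
                results[cat] = future.result(timeout=timeout)
            except concurrent.futures.TimeoutError:
                print(f"category {cat!r} timed out")
            except Exception as exc:
                print(f"category {cat!r} failed: {exc}")
    return results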
@@ -203,21 +202,18 @@ def preprocess_batch(self, texts: List[str]) -> List[str]:
         ] # Preprocess each text in the list

     def _process_doc_tokens(self, doc):
-        """Helper function to process tokens from a single spaCy doc.
-
-        Args:
-            doc: A spaCy Doc object.
-
-        Returns:
-            A list of lemmatized tokens.
-        """
         tokens = []
-        for token in doc:
-            if (
-                token.text in self.stop_words
-                or len(token.text) <= 1
-                or token.text.isnumeric()
-            ):
+        skill_spans = []
+        # First, add SKILL entity texts as tokens and record their spans.
+        for ent in doc.ents:
+            if ent.label_ == "SKILL":
+                tokens.append(ent.text)
+                skill_spans.append((ent.start, ent.end))
+        # Process remaining tokens that are not part of a SKILL entity.
+        for i, token in enumerate(doc):
+            if any(start <= i < end for start, end in skill_spans):
+                continue
+            if token.text.lower() in self.stop_words or len(token.text) <= 1 or token.text.isnumeric():
                 continue
             try:
                 lemma = token.lemma_.lower().strip()
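The rewritten `_process_doc_tokens` keeps each SKILL entity as a single token and records its `(ent.start, ent.end)` token span so the per-token loop can skip those indices. A self-contained sketch of the span-skipping idea, assuming spaCy 3.x and one example `entity_ruler` pattern (the real patterns come from the project's whitelist):

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "SKILL", "pattern": "machine learning"}])

doc = nlp("expert in machine learning and statistics")
skill_spans = [(ent.start, ent.end) for ent in doc.ents if ent.label_ == "SKILL"]
kept = [t.text for i, t in enumerate(doc)
        if not any(start <= i < end for start, end in skill_spans)]
print(skill_spans)  # [(2, 4)] -> token indices of "machine learning"
print(kept)         # ['expert', 'in', 'and', 'statistics']

Because `ent.start` and `ent.end` are token indices, the `start <= i < end` test lines up exactly with `enumerate(doc)`.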
@@ -354,71 +350,65 @@ def _generate_synonyms(self, skills: List[str]) -> Set[str]:
         )
         return synonyms # Return the set of synonyms

+    # --- Improved _generate_ngrams function ---
     def _generate_ngrams(self, tokens: List[str], n: int) -> Set[str]:
-        """Generate n-grams from a list of tokens.
-
-        Args:
-            tokens: A list of tokens.
-            n: The length of the n-grams.
-
-        Returns:
-            A set of n-grams.
+        """Generate n-grams from tokens and exclude any n-grams that contain single-letter words.
+
+        This extra filter ensures that even if a single-letter token slips through tokenization,
+        the n-grams used for TF-IDF contain only valid multi-character words.
         """
+        # Remove any tokens that are empty after stripping whitespace.
         filtered_tokens = [
-            token for token in tokens if token.strip()
-        ] # Filter out empty tokens
-        return {
+            token for token in tokens
+            if token.strip() and len(token.strip()) > 1 and token not in self.preprocessor.stop_words
+        ]
+        if len(filtered_tokens) < n:
+            return set()
+        # Generate all n-grams from the filtered tokens.
+        ngrams = {
             " ".join(filtered_tokens[i : i + n])
             for i in range(len(filtered_tokens) - n + 1)
-        } # Generate and return n-grams
+            if all(len(word.strip()) > 1 for word in filtered_tokens[i : i + n])
+        }
+        return ngrams

     def extract_keywords(self, texts: List[str]) -> List[List[str]]:
-        """Extract keywords from a list of text strings.
-
-        Args:
-            texts: A list of text strings.
-
-        Returns:
-            A list of lists of keywords.
-        """
-        cleaned = self.preprocessor.preprocess_batch(
-            texts
-        ) # Preprocess the text strings
-        tokenized = self.preprocessor.tokenize_batch(
-            cleaned
-        ) # Tokenize the text strings
-        all_keywords = [] # Initialize the list of keywords
-        for tokens in tokenized: # Iterate through the tokenized text strings
-            keywords = (
-                set()
-            ) # Initialize the set of keywords for the current text string
-            min_n = min(
-                self.whitelist_ngram_range[0], self.ngram_range[0]
-            ) # Get the minimum n-gram length
-            max_n = max(
-                self.whitelist_ngram_range[1], self.ngram_range[1]
-            ) # Get the maximum n-gram length
-            for n in range(min_n, max_n + 1): # Iterate through the n-gram lengths
-                ngrams = self._generate_ngrams(
-                    tokens, n
-                ) # Generate n-grams from the tokens
-                if (
-                    n >= self.whitelist_ngram_range[0]
-                    and n <= self.whitelist_ngram_range[1]
-                ) or (
-                    n >= self.ngram_range[0] and n <= self.ngram_range[1]
-                ): # Check if the n-gram length is within the valid range
-                    keywords.update(ngrams) # Add the n-grams to the set of keywords
-            all_keywords.append(
-                list(keywords)
-            ) # Add the list of keywords to the list of all keywords
-        if self.config.get(
-            "semantic_validation", False
-        ): # Check if semantic validation is enabled
-            return self._semantic_filter(
-                all_keywords, texts
-            ) # Apply semantic filtering
-        return all_keywords # Return the list of all keywords
+        all_keywords = []
+        for text in texts:
+            doc = self.nlp(text)
+            # Extract preserved SKILL entities.
+            entity_keywords = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]
+
+            # Exclude SKILL entities from regular tokenization.
+            skill_spans = [(ent.start, ent.end) for ent in doc.ents if ent.label_ == "SKILL"]
+            non_entity_tokens = []
+            for i, token in enumerate(doc):
+                if any(start <= i < end for start, end in skill_spans):
+                    continue
+                non_entity_tokens.append(token.text)
+
+            # Preprocess and split to get cleaned tokens.
+            preprocessed_text = self.preprocessor.preprocess(" ".join(non_entity_tokens))
+            token_list = preprocessed_text.split()
+
+            non_entity_keywords = set()
+            # Generate n-grams only for non-entity tokens.
+            for n in range(self.ngram_range[0], self.ngram_range[1] + 1):
+                non_entity_keywords.update(self._generate_ngrams(token_list, n))
+
+            # Combine entity and non-entity keywords.
+            keywords = set(entity_keywords) | non_entity_keywords
+            filtered_keywords = [
+                kw for kw in keywords
+                if len(kw.strip()) > 1
+                and not any(len(w.strip()) <= 1 for w in kw.split())
+                and not all(w in self.preprocessor.stop_words for w in kw.split())
+            ]
+            all_keywords.append(filtered_keywords)
+
+        if self.config.get("semantic_validation", False):
+            return self._semantic_filter(all_keywords, texts)
+        return all_keywords

     def _semantic_filter(
         self, keyword_lists: List[List[str]], texts: List[str]
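The stricter `_generate_ngrams` drops empty tokens, single-character tokens, and stop words before windowing, then rejects any window that still contains a one-character word. A standalone sketch of that filter with example stop words and tokens (not the project's configuration):

from typing import List, Set

STOP_WORDS = {"and", "the"}  # example stop-word set

def generate_ngrams(tokens: List[str], n: int) -> Set[str]:
    # Keep only non-empty, multi-character, non-stop-word tokens before windowing.
    filtered = [t for t in tokens if t.strip() and len(t.strip()) > 1 and t not in STOP_WORDS]
    if len(filtered) < n:
        return set()
    return {
        " ".join(filtered[i:i + n])
        for i in range(len(filtered) - n + 1)
        if all(len(w.strip()) > 1 for w in filtered[i:i + n])
    }

print(generate_ngrams(["python", "c", "data", "analysis"], 2))
# {'python data', 'data analysis'} -- 'c' is dropped before windows are built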
@@ -610,6 +600,11 @@ def __init__(self, config_path: str = "config.yaml"):
                 "spaCy version <3.0 may have compatibility issues"
             ) # Warn if the spaCy version is too low

+        whitelisted_phrases = self.config.get("skills_whitelist", [])
+        if whitelisted_phrases and "entity_ruler" in self.nlp.pipe_names:
+            patterns = [{"label": "SKILL", "pattern": phrase} for phrase in whitelisted_phrases]
+            self.nlp.get_pipe("entity_ruler").add_patterns(patterns)
+
     def _add_entity_ruler(self):
         """Add an EntityRuler to the spaCy pipeline for section detection.

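The new `__init__` block registers every whitelisted skill phrase as an `entity_ruler` pattern labelled SKILL, which is what `_process_doc_tokens` and `extract_keywords` later rely on. A sketch under the assumption that an `entity_ruler` pipe is already present, as the guard in the diff requires; the phrases here are examples, not the real `skills_whitelist`:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("entity_ruler")  # the diff assumes this pipe already exists

skills_whitelist = ["machine learning", "sql", "project management"]  # example config values
patterns = [{"label": "SKILL", "pattern": phrase} for phrase in skills_whitelist]
nlp.get_pipe("entity_ruler").add_patterns(patterns)

doc = nlp("strong sql and project management background")
print([(ent.text, ent.label_) for ent in doc.ents])
# [('sql', 'SKILL'), ('project management', 'SKILL')]

By default, string patterns match exact token text; if case-insensitive matching is wanted, the ruler can instead be added with config={"phrase_matcher_attr": "LOWER"}.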
@@ -846,6 +841,8 @@ def _try_load_model(self, model_name):
             nlp = spacy.load(model_name, disable=["parser", "ner"])
             if "lemmatizer" not in nlp.pipe_names:
                 nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+            if "sentencizer" not in nlp.pipe_names:
+                nlp.add_pipe("sentencizer")
             return nlp
         except OSError:
             return None
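Since `_try_load_model` disables the parser, nothing sets sentence boundaries, and iterating `doc.sents` downstream would raise; the added sentencizer fills that gap cheaply. A small illustration, assuming `en_core_web_sm` is installed (substitute whatever model the config names):

import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")  # rule-based sentence boundaries, no parser needed

doc = nlp("First sentence. Second sentence.")
print([sent.text for sent in doc.sents])
# ['First sentence.', 'Second sentence.']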
@@ -884,16 +881,16 @@ def _load_and_configure_spacy_model(self):
         for attempt in range(retry_attempts + 1):
             nlp = self._try_load_model(model)
             if nlp:
+                logger.debug(f"Loaded spaCy pipeline: {nlp.pipe_names}")
                 return nlp
             logger.warning(
                 f"Model '{model}' not found. Attempt {attempt + 1}/{retry_attempts + 1}"
             )
-            if (
-                attempt < retry_attempts and model == model_name
-            ): # Only download the specified model
+            if attempt < retry_attempts and model == model_name:
                 if self._download_model(model):
                     nlp = self._try_load_model(model)
                     if nlp:
+                        logger.debug(f"Loaded spaCy pipeline: {nlp.pipe_names}")
                         return nlp
             time.sleep(2) # Add a delay of 2 seconds between retries

@@ -926,31 +923,41 @@ def _calculate_scores(self, dtm, feature_names, keyword_sets, job_descriptions):
             })
         return results

+    # --- Improved _create_tfidf_matrix function ---
     def _create_tfidf_matrix(self, texts, keyword_sets):
-        """Create and fit the TF-IDF vectorizer and transform the keyword sets.
-
-        Args:
-            texts: List of (preprocessed) job description texts.
-            keyword_sets: List of lists of keywords (one list per job description).
+        """Create and return a TF-IDF matrix for the job descriptions using pre-validated keyword sets.

-        Returns:
-            A tuple containing:
-                - The document-term matrix (sparse matrix).
-                - The list of feature names
+        The vectorizer is adjusted to treat each document as a pre-tokenized list,
+        preserving multi-word keywords without re-tokenization. As an extra safety
+        measure, we also filter each document's keyword set to remove invalid tokens.
         """
         max_features = self.config.get("tfidf_max_features", 10000)
         vectorizer = TfidfVectorizer(
             ngram_range=self.keyword_extractor.ngram_range,
             lowercase=False,
+            # Use identity functions so that input is treated as pre-tokenized lists.
             tokenizer=lambda x: x,
             preprocessor=lambda x: x,
             max_features=max_features,
             dtype=np.float32,
         )
-        dtm = vectorizer.fit_transform([" ".join(kw) for kw in keyword_sets])
-        if len(vectorizer.get_feature_names_out()) == max_features:
+        # Validate each document's keyword set to discard any term that includes a word of length <=1.
+        validated_sets = [
+            [kw for kw in kw_set if all(len(word) > 1 for word in kw.split())]
+            for kw_set in keyword_sets
+        ]
+        logger.debug(f"Validated keyword sets sample: {validated_sets[:2]}")
+        try:
+            dtm = vectorizer.fit_transform(validated_sets)
+        except ValueError as e:
+            logger.error(f"TF-IDF vectorization failed: {e}. Check keyword_sets content.")
+            return None, []
+        feature_names = vectorizer.get_feature_names_out()
+        if len(feature_names) == max_features:
             logger.warning(f"TF-IDF vocabulary reached the limit of {max_features} features")
-        return dtm, vectorizer.get_feature_names_out()
+        if not feature_names.size:
+            logger.warning("No features extracted by TF-IDF. Check input keyword sets.")
+        return dtm, feature_names

     def analyze_jobs(self, job_descriptions: Dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """Analyze job descriptions and extract keywords.
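With `lowercase=False` and identity `tokenizer`/`preprocessor`, scikit-learn's `TfidfVectorizer` accepts each document as an already-tokenized list, so multi-word keywords stay intact as single features instead of being split on whitespace. A standalone sketch with made-up keyword lists and `ngram_range=(1, 1)` (the project passes its configured range instead):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

keyword_sets = [
    ["machine learning", "sql", "data analysis"],
    ["project management", "sql"],
]

vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    lowercase=False,
    tokenizer=lambda x: x,       # documents are already lists of keywords
    preprocessor=lambda x: x,    # skip string preprocessing entirely
    dtype=np.float32,
)
dtm = vectorizer.fit_transform(keyword_sets)
print(vectorizer.get_feature_names_out())
# ['data analysis' 'machine learning' 'project management' 'sql']
print(dtm.shape)  # (2, 4)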