Commit eb87090

Improve Keyword Extraction and Semantic Analysis (#19)
This commit introduces several enhancements to the ATS optimizer script and configuration:

1. **Configuration Updates**:
   - Added `semantic_validation: True` to enable semantic filtering of keywords by default.
   - Increased `similarity_threshold` from 0.6 to 0.65 for stricter semantic categorization.
   - Adjusted `ngram_range` and `whitelist_ngram_range` from `[1, 3]` to `[1, 2]` to focus on shorter, more precise phrases.

2. **Script Improvements**:
   - **Entity Ruler Enhancement**: Added whitelisted phrases from `skills_whitelist` as `SKILL` entities in the spaCy pipeline, preserving multi-word skills during tokenization.
   - **Keyword Extraction**:
     - Updated `_process_doc_tokens` to prioritize `SKILL` entities, ensuring they are preserved as whole phrases before tokenizing the remaining text.
     - Improved `_generate_ngrams` to filter out single-letter tokens and stop words, improving TF-IDF accuracy.
     - Refined `extract_keywords` to integrate `SKILL` entity extraction with regular tokenization, combining both into robust keyword lists.
   - **TF-IDF Matrix**:
     - Enhanced `_create_tfidf_matrix` to validate keyword sets, removing invalid tokens (e.g., single-letter words) before vectorization.
     - Added debug logging for validated keyword sets.
   - **Model Loading**: Added `sentencizer` to the spaCy pipeline in `_try_load_model` for consistent sentence boundary detection.
   - **Minor Fixes**: Removed the unused `Pool` import from `multiprocessing` and streamlined the imports.

These changes improve keyword precision, preserve critical multi-word phrases, and enhance semantic analysis, making the optimizer more effective for ATS systems.
1 parent 97c7ba1 commit eb87090
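The core of the change, sketched as a minimal standalone example (assuming spaCy 3.x with `en_core_web_sm` installed; the phrases below are illustrative, not the repository's actual `skills_whitelist`):

```python
import spacy

# Minimal sketch: register whitelisted phrases as SKILL entities so that
# multi-word skills survive tokenization as whole phrases.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
ruler = nlp.add_pipe("entity_ruler")
skills_whitelist = ["machine learning", "data analysis"]  # illustrative phrases only
ruler.add_patterns([{"label": "SKILL", "pattern": p} for p in skills_whitelist])

doc = nlp("Experience with machine learning and data analysis is required.")
print([(ent.text, ent.label_) for ent in doc.ents])
# [('machine learning', 'SKILL'), ('data analysis', 'SKILL')]
```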

File tree

2 files changed: +106 −100

config.yaml

+11 −12

@@ -6,7 +6,17 @@ max_job_descriptions: 100
 default_category_name: "Other"
 min_desc_length: 50
 min_jobs: 2
+semantic_validation: True
+similarity_threshold: 0.65
+ngram_range: [1,2]
+whitelist_ngram_range: [1,2]
+spacy_model: "en_core_web_sm"
+cache_size: 1000
 
+weighting:
+  tfidf_weight: 0.7
+  frequency_weight: 0.3
+  whitelist_boost: 1.5
 
 section_headings:
   - responsibilities
@@ -885,15 +895,4 @@ keyword_categories:
     - product lifecycle
     - requirements management
 
-  Other: [] # Keywords that don't neatly fit into the above categories. Customize as needed.
-
-ngram_range: [1, 3]
-similarity_threshold: 0.6
-spacy_model: "en_core_web_sm"
-cache_size: 1000
-whitelist_ngram_range: [1, 3]
-
-weighting:
-  tfidf_weight: 0.7
-  frequency_weight: 0.3
-  whitelist_boost: 1.5
+  Other: [] # Keywords that don't neatly fit into the above categories. Customize as needed.
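To sanity-check the relocated settings after editing `config.yaml`, a hedged sketch (the key names come from the diff above; the loader shown here is illustrative and assumes PyYAML, not necessarily the script's own loading code):

```python
import yaml

# Illustrative check of the keys added/moved in this commit.
with open("config.yaml", encoding="utf-8") as fh:
    config = yaml.safe_load(fh)

print(config.get("semantic_validation"))                   # True
print(config.get("similarity_threshold"))                  # 0.65
print(config.get("ngram_range"))                           # [1, 2]
print(config.get("whitelist_ngram_range"))                 # [1, 2]
print(config.get("weighting", {}).get("whitelist_boost"))  # 1.5
```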

keywords4cv.py

+95 −88

@@ -7,7 +7,6 @@
 from collections import OrderedDict  # For creating an LRU cache
 from typing import Dict, List, Set, Tuple, NamedTuple, Optional  # For type hinting
 from multiprocessing import (
-    Pool,
     TimeoutError as MPTimeoutError,
 )  # For parallel processing
 import concurrent.futures  # For multithreading category vector calculation
@@ -203,21 +202,18 @@ def preprocess_batch(self, texts: List[str]) -> List[str]:
         ]  # Preprocess each text in the list
 
     def _process_doc_tokens(self, doc):
-        """Helper function to process tokens from a single spaCy doc.
-
-        Args:
-            doc: A spaCy Doc object.
-
-        Returns:
-            A list of lemmatized tokens.
-        """
         tokens = []
-        for token in doc:
-            if (
-                token.text in self.stop_words
-                or len(token.text) <= 1
-                or token.text.isnumeric()
-            ):
+        skill_spans = []
+        # First, add SKILL entity texts as tokens and record their spans.
+        for ent in doc.ents:
+            if ent.label_ == "SKILL":
+                tokens.append(ent.text)
+                skill_spans.append((ent.start, ent.end))
+        # Process remaining tokens that are not part of a SKILL entity.
+        for i, token in enumerate(doc):
+            if any(start <= i < end for start, end in skill_spans):
+                continue
+            if token.text.lower() in self.stop_words or len(token.text) <= 1 or token.text.isnumeric():
                 continue
             try:
                 lemma = token.lemma_.lower().strip()
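The span-exclusion idea in isolation, as a hedged standalone sketch (a blank pipeline and a single illustrative pattern, not the repository's class method):

```python
import spacy

# Standalone sketch of the SKILL-span exclusion used in _process_doc_tokens.
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "SKILL", "pattern": "machine learning"}])

doc = nlp("Strong machine learning background")
tokens = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]
skill_spans = [(ent.start, ent.end) for ent in doc.ents if ent.label_ == "SKILL"]
for i, token in enumerate(doc):
    if any(start <= i < end for start, end in skill_spans):
        continue  # token is covered by a preserved SKILL phrase
    tokens.append(token.text.lower())
print(tokens)  # ['machine learning', 'strong', 'background']
```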
@@ -354,71 +350,65 @@ def _generate_synonyms(self, skills: List[str]) -> Set[str]:
         )
         return synonyms  # Return the set of synonyms
 
+    # --- Improved _generate_ngrams function ---
     def _generate_ngrams(self, tokens: List[str], n: int) -> Set[str]:
-        """Generate n-grams from a list of tokens.
-
-        Args:
-            tokens: A list of tokens.
-            n: The length of the n-grams.
-
-        Returns:
-            A set of n-grams.
+        """Generate n-grams from tokens and exclude any n-grams that contain single-letter words.
+
+        This extra filter ensures that even if a single-letter slips through tokenization,
+        the n-grams used for TF-IDF contain only valid multi-character words.
         """
+        # Remove any tokens that are empty after stripping whitespace.
         filtered_tokens = [
-            token for token in tokens if token.strip()
-        ]  # Filter out empty tokens
-        return {
+            token for token in tokens
+            if token.strip() and len(token.strip()) > 1 and token not in self.preprocessor.stop_words
+        ]
+        if len(filtered_tokens) < n:
+            return set()
+        # Generate all n-grams from the filtered tokens.
+        ngrams = {
             " ".join(filtered_tokens[i : i + n])
             for i in range(len(filtered_tokens) - n + 1)
-        }  # Generate and return n-grams
+            if all(len(word.strip()) > 1 for word in filtered_tokens[i : i + n])
+        }
+        return ngrams
 
     def extract_keywords(self, texts: List[str]) -> List[List[str]]:
-        """Extract keywords from a list of text strings.
-
-        Args:
-            texts: A list of text strings.
-
-        Returns:
-            A list of lists of keywords.
-        """
-        cleaned = self.preprocessor.preprocess_batch(
-            texts
-        )  # Preprocess the text strings
-        tokenized = self.preprocessor.tokenize_batch(
-            cleaned
-        )  # Tokenize the text strings
-        all_keywords = []  # Initialize the list of keywords
-        for tokens in tokenized:  # Iterate through the tokenized text strings
-            keywords = (
-                set()
-            )  # Initialize the set of keywords for the current text string
-            min_n = min(
-                self.whitelist_ngram_range[0], self.ngram_range[0]
-            )  # Get the minimum n-gram length
-            max_n = max(
-                self.whitelist_ngram_range[1], self.ngram_range[1]
-            )  # Get the maximum n-gram length
-            for n in range(min_n, max_n + 1):  # Iterate through the n-gram lengths
-                ngrams = self._generate_ngrams(
-                    tokens, n
-                )  # Generate n-grams from the tokens
-                if (
-                    n >= self.whitelist_ngram_range[0]
-                    and n <= self.whitelist_ngram_range[1]
-                ) or (
-                    n >= self.ngram_range[0] and n <= self.ngram_range[1]
-                ):  # Check if the n-gram length is within the valid range
-                    keywords.update(ngrams)  # Add the n-grams to the set of keywords
-            all_keywords.append(
-                list(keywords)
-            )  # Add the list of keywords to the list of all keywords
-        if self.config.get(
-            "semantic_validation", False
-        ):  # Check if semantic validation is enabled
-            return self._semantic_filter(
-                all_keywords, texts
-            )  # Apply semantic filtering
-        return all_keywords  # Return the list of all keywords
+        all_keywords = []
+        for text in texts:
+            doc = self.nlp(text)
+            # Extract preserved SKILL entities.
+            entity_keywords = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]
+
+            # Exclude SKILL entities from regular tokenization.
+            skill_spans = [(ent.start, ent.end) for ent in doc.ents if ent.label_ == "SKILL"]
+            non_entity_tokens = []
+            for i, token in enumerate(doc):
+                if any(start <= i < end for start, end in skill_spans):
+                    continue
+                non_entity_tokens.append(token.text)
+
+            # Preprocess and split to get cleaned tokens.
+            preprocessed_text = self.preprocessor.preprocess(" ".join(non_entity_tokens))
+            token_list = preprocessed_text.split()
+
+            non_entity_keywords = set()
+            # Generate n-grams only for non-entity tokens.
+            for n in range(self.ngram_range[0], self.ngram_range[1] + 1):
+                non_entity_keywords.update(self._generate_ngrams(token_list, n))
+
+            # Combine entity and non-entity keywords.
+            keywords = set(entity_keywords) | non_entity_keywords
+            filtered_keywords = [
+                kw for kw in keywords
+                if len(kw.strip()) > 1
+                and not any(len(w.strip()) <= 1 for w in kw.split())
+                and not all(w in self.preprocessor.stop_words for w in kw.split())
+            ]
+            all_keywords.append(filtered_keywords)
+
+        if self.config.get("semantic_validation", False):
+            return self._semantic_filter(all_keywords, texts)
+        return all_keywords
 
     def _semantic_filter(
         self, keyword_lists: List[List[str]], texts: List[str]
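A small worked example of the new n-gram filtering, as a standalone sketch with an assumed stop-word set (mirroring, not reusing, the method above): single-letter tokens and stop words are dropped before bigrams are formed.

```python
# Standalone sketch of the filtering behaviour of _generate_ngrams.
STOP_WORDS = {"the", "and", "of"}  # assumed stop-word set for illustration

def generate_ngrams(tokens, n):
    filtered = [
        t for t in tokens
        if t.strip() and len(t.strip()) > 1 and t not in STOP_WORDS
    ]
    if len(filtered) < n:
        return set()
    return {
        " ".join(filtered[i : i + n])
        for i in range(len(filtered) - n + 1)
        if all(len(w.strip()) > 1 for w in filtered[i : i + n])
    }

print(generate_ngrams(["design", "of", "data", "x", "pipelines"], 2))
# {'design data', 'data pipelines'}  (set order may vary)
```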
@@ -610,6 +600,11 @@ def __init__(self, config_path: str = "config.yaml"):
                "spaCy version <3.0 may have compatibility issues"
            )  # Warn if the spaCy version is too low
 
+        whitelisted_phrases = self.config.get("skills_whitelist", [])
+        if whitelisted_phrases and "entity_ruler" in self.nlp.pipe_names:
+            patterns = [{"label": "SKILL", "pattern": phrase} for phrase in whitelisted_phrases]
+            self.nlp.get_pipe("entity_ruler").add_patterns(patterns)
+
     def _add_entity_ruler(self):
         """Add an EntityRuler to the spaCy pipeline for section detection.
@@ -846,6 +841,8 @@ def _try_load_model(self, model_name):
             nlp = spacy.load(model_name, disable=["parser", "ner"])
             if "lemmatizer" not in nlp.pipe_names:
                 nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+            if "sentencizer" not in nlp.pipe_names:
+                nlp.add_pipe("sentencizer")
             return nlp
         except OSError:
             return None
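Why the `sentencizer` matters here, as a hedged standalone sketch: with the parser disabled there is no dependency-based sentence segmentation, so a rule-based sentencizer has to set the boundaries that `doc.sents` relies on.

```python
import spacy

# Standalone sketch: a pipeline with only a sentencizer still yields doc.sents.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # rule-based sentence boundary detection
doc = nlp("Built ETL pipelines. Led a team of analysts.")
print([sent.text for sent in doc.sents])
# ['Built ETL pipelines.', 'Led a team of analysts.']
```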
@@ -884,16 +881,16 @@ def _load_and_configure_spacy_model(self):
         for attempt in range(retry_attempts + 1):
             nlp = self._try_load_model(model)
             if nlp:
+                logger.debug(f"Loaded spaCy pipeline: {nlp.pipe_names}")
                 return nlp
             logger.warning(
                 f"Model '{model}' not found. Attempt {attempt + 1}/{retry_attempts + 1}"
             )
-            if (
-                attempt < retry_attempts and model == model_name
-            ):  # Only download the specified model
+            if attempt < retry_attempts and model == model_name:
                 if self._download_model(model):
                     nlp = self._try_load_model(model)
                     if nlp:
+                        logger.debug(f"Loaded spaCy pipeline: {nlp.pipe_names}")
                         return nlp
             time.sleep(2)  # Add a delay of 2 seconds between retries
 
@@ -926,31 +923,41 @@ def _calculate_scores(self, dtm, feature_names, keyword_sets, job_descriptions):
             })
         return results
 
+    # --- Improved _create_tfidf_matrix function ---
     def _create_tfidf_matrix(self, texts, keyword_sets):
-        """Create and fit the TF-IDF vectorizer and transform the keyword sets.
-
-        Args:
-            texts: List of (preprocessed) job description texts.
-            keyword_sets: List of lists of keywords (one list per job description).
+        """Create and return a TF-IDF matrix for the job descriptions using pre-validated keyword sets.
 
-        Returns:
-            A tuple containing:
-            - The document-term matrix (sparse matrix).
-            - The list of feature names
+        The vectorizer is adjusted to treat each document as a pre-tokenized list,
+        preserving multi-word keywords without re-tokenization. As an extra safety
+        measure, we also filter each document's keyword set to remove invalid tokens.
         """
         max_features = self.config.get("tfidf_max_features", 10000)
         vectorizer = TfidfVectorizer(
             ngram_range=self.keyword_extractor.ngram_range,
             lowercase=False,
+            # Use identity functions so that input is treated as pre-tokenized lists.
             tokenizer=lambda x: x,
             preprocessor=lambda x: x,
             max_features=max_features,
             dtype=np.float32,
         )
-        dtm = vectorizer.fit_transform([" ".join(kw) for kw in keyword_sets])
-        if len(vectorizer.get_feature_names_out()) == max_features:
+        # Validate each document's keyword set to discard any term that includes a word of length <= 1.
+        validated_sets = [
+            [kw for kw in kw_set if all(len(word) > 1 for word in kw.split())]
+            for kw_set in keyword_sets
+        ]
+        logger.debug(f"Validated keyword sets sample: {validated_sets[:2]}")
+        try:
+            dtm = vectorizer.fit_transform(validated_sets)
+        except ValueError as e:
+            logger.error(f"TF-IDF vectorization failed: {e}. Check keyword_sets content.")
+            return None, []
+        feature_names = vectorizer.get_feature_names_out()
+        if len(feature_names) == max_features:
             logger.warning(f"TF-IDF vocabulary reached the limit of {max_features} features")
-        return dtm, vectorizer.get_feature_names_out()
+        if not feature_names.size:
+            logger.warning("No features extracted by TF-IDF. Check input keyword sets.")
+        return dtm, feature_names
 
     def analyze_jobs(self, job_descriptions: Dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """Analyze job descriptions and extract keywords.
