 from collections import OrderedDict # For creating an LRU cache
 from typing import Dict, List, Set, Tuple, NamedTuple, Optional # For type hinting
 from multiprocessing import (
-    Pool,
     TimeoutError as MPTimeoutError,
 ) # For parallel processing
 import concurrent.futures # For multithreading category vector calculation
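The removed `Pool` import leaves only `TimeoutError` (as `MPTimeoutError`) from `multiprocessing`, while `concurrent.futures` carries the threaded category-vector work the comment mentions. A minimal sketch of that thread-pool-with-timeout pattern; `compute_fn` and `categories` are hypothetical names, not part of the repository:

import concurrent.futures

def compute_vectors_in_threads(categories, compute_fn, timeout=30):
    # Run compute_fn over categories in worker threads; skip slow or failing ones.
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(compute_fn, cat): cat for cat in categories}
        for future, cat in futures.items():
            try:
                results[cat] = future.result(timeout=timeout)
            except concurrent.futures.TimeoutError:
                print(f"category {cat!r} timed out")
            except Exception as exc:
                print(f"category {cat!r} failed: {exc}")
    return results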
@@ -203,21 +202,18 @@ def preprocess_batch(self, texts: List[str]) -> List[str]:
         ] # Preprocess each text in the list

     def _process_doc_tokens(self, doc):
-        """Helper function to process tokens from a single spaCy doc.
-
-        Args:
-            doc: A spaCy Doc object.
-
-        Returns:
-            A list of lemmatized tokens.
-        """
         tokens = []
-        for token in doc:
-            if (
-                token.text in self.stop_words
-                or len(token.text) <= 1
-                or token.text.isnumeric()
-            ):
+        skill_spans = []
+        # First, add SKILL entity texts as tokens and record their spans.
+        for ent in doc.ents:
+            if ent.label_ == "SKILL":
+                tokens.append(ent.text)
+                skill_spans.append((ent.start, ent.end))
+        # Process remaining tokens that are not part of a SKILL entity.
+        for i, token in enumerate(doc):
+            if any(start <= i < end for start, end in skill_spans):
+                continue
+            if token.text.lower() in self.stop_words or len(token.text) <= 1 or token.text.isnumeric():
                 continue
             try:
                 lemma = token.lemma_.lower().strip()
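The rewritten `_process_doc_tokens` keeps each SKILL entity as a single token and records its `(ent.start, ent.end)` token span so the per-token loop can skip those indices. A self-contained sketch of the span-skipping idea, assuming spaCy 3.x and one example `entity_ruler` pattern (the real patterns come from the project's whitelist):

import spacy

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "SKILL", "pattern": "machine learning"}])

doc = nlp("expert in machine learning and statistics")
skill_spans = [(ent.start, ent.end) for ent in doc.ents if ent.label_ == "SKILL"]
kept = [t.text for i, t in enumerate(doc)
        if not any(start <= i < end for start, end in skill_spans)]
print(skill_spans)  # [(2, 4)] -> token indices of "machine learning"
print(kept)         # ['expert', 'in', 'and', 'statistics']

Because `ent.start` and `ent.end` are token indices, the `start <= i < end` test lines up exactly with `enumerate(doc)`.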
@@ -354,71 +350,65 @@ def _generate_synonyms(self, skills: List[str]) -> Set[str]:
         )
         return synonyms # Return the set of synonyms

+    # --- Improved _generate_ngrams function ---
     def _generate_ngrams(self, tokens: List[str], n: int) -> Set[str]:
-        """Generate n-grams from a list of tokens.
-
-        Args:
-            tokens: A list of tokens.
-            n: The length of the n-grams.
-
-        Returns:
-            A set of n-grams.
+        """Generate n-grams from tokens and exclude any n-grams that contain single-letter words.
+
+        This extra filter ensures that even if a single-letter token slips through tokenization,
+        the n-grams used for TF-IDF contain only valid multi-character words.
         """
+        # Remove any tokens that are empty after stripping whitespace.
         filtered_tokens = [
-            token for token in tokens if token.strip()
-        ] # Filter out empty tokens
-        return {
+            token for token in tokens
+            if token.strip() and len(token.strip()) > 1 and token not in self.preprocessor.stop_words
+        ]
+        if len(filtered_tokens) < n:
+            return set()
+        # Generate all n-grams from the filtered tokens.
+        ngrams = {
             " ".join(filtered_tokens[i : i + n])
             for i in range(len(filtered_tokens) - n + 1)
-        } # Generate and return n-grams
+            if all(len(word.strip()) > 1 for word in filtered_tokens[i : i + n])
+        }
+        return ngrams

     def extract_keywords(self, texts: List[str]) -> List[List[str]]:
-        """Extract keywords from a list of text strings.
-
-        Args:
-            texts: A list of text strings.
-
-        Returns:
-            A list of lists of keywords.
-        """
-        cleaned = self.preprocessor.preprocess_batch(
-            texts
-        ) # Preprocess the text strings
-        tokenized = self.preprocessor.tokenize_batch(
-            cleaned
-        ) # Tokenize the text strings
-        all_keywords = [] # Initialize the list of keywords
-        for tokens in tokenized: # Iterate through the tokenized text strings
-            keywords = (
-                set()
-            ) # Initialize the set of keywords for the current text string
-            min_n = min(
-                self.whitelist_ngram_range[0], self.ngram_range[0]
-            ) # Get the minimum n-gram length
-            max_n = max(
-                self.whitelist_ngram_range[1], self.ngram_range[1]
-            ) # Get the maximum n-gram length
-            for n in range(min_n, max_n + 1): # Iterate through the n-gram lengths
-                ngrams = self._generate_ngrams(
-                    tokens, n
-                ) # Generate n-grams from the tokens
-                if (
-                    n >= self.whitelist_ngram_range[0]
-                    and n <= self.whitelist_ngram_range[1]
-                ) or (
-                    n >= self.ngram_range[0] and n <= self.ngram_range[1]
-                ): # Check if the n-gram length is within the valid range
-                    keywords.update(ngrams) # Add the n-grams to the set of keywords
-            all_keywords.append(
-                list(keywords)
-            ) # Add the list of keywords to the list of all keywords
-        if self.config.get(
-            "semantic_validation", False
-        ): # Check if semantic validation is enabled
-            return self._semantic_filter(
-                all_keywords, texts
-            ) # Apply semantic filtering
-        return all_keywords # Return the list of all keywords
+        all_keywords = []
+        for text in texts:
+            doc = self.nlp(text)
+            # Extract preserved SKILL entities.
+            entity_keywords = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]
+
+            # Exclude SKILL entities from regular tokenization.
+            skill_spans = [(ent.start, ent.end) for ent in doc.ents if ent.label_ == "SKILL"]
+            non_entity_tokens = []
+            for i, token in enumerate(doc):
+                if any(start <= i < end for start, end in skill_spans):
+                    continue
+                non_entity_tokens.append(token.text)
+
+            # Preprocess and split to get cleaned tokens.
+            preprocessed_text = self.preprocessor.preprocess(" ".join(non_entity_tokens))
+            token_list = preprocessed_text.split()
+
+            non_entity_keywords = set()
+            # Generate n-grams only for non-entity tokens.
+            for n in range(self.ngram_range[0], self.ngram_range[1] + 1):
+                non_entity_keywords.update(self._generate_ngrams(token_list, n))
+
+            # Combine entity and non-entity keywords.
+            keywords = set(entity_keywords) | non_entity_keywords
+            filtered_keywords = [
+                kw for kw in keywords
+                if len(kw.strip()) > 1
+                and not any(len(w.strip()) <= 1 for w in kw.split())
+                and not all(w in self.preprocessor.stop_words for w in kw.split())
+            ]
+            all_keywords.append(filtered_keywords)
+
+        if self.config.get("semantic_validation", False):
+            return self._semantic_filter(all_keywords, texts)
+        return all_keywords

     def _semantic_filter(
         self, keyword_lists: List[List[str]], texts: List[str]
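The stricter `_generate_ngrams` drops empty tokens, single-character tokens, and stop words before windowing, then rejects any window that still contains a one-character word. A standalone sketch of that filter with example stop words and tokens (not the project's configuration):

from typing import List, Set

STOP_WORDS = {"and", "the"}  # example stop-word set

def generate_ngrams(tokens: List[str], n: int) -> Set[str]:
    # Keep only non-empty, multi-character, non-stop-word tokens before windowing.
    filtered = [t for t in tokens if t.strip() and len(t.strip()) > 1 and t not in STOP_WORDS]
    if len(filtered) < n:
        return set()
    return {
        " ".join(filtered[i:i + n])
        for i in range(len(filtered) - n + 1)
        if all(len(w.strip()) > 1 for w in filtered[i:i + n])
    }

print(generate_ngrams(["python", "c", "data", "analysis"], 2))
# {'python data', 'data analysis'} -- 'c' is dropped before windows are built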
@@ -610,6 +600,11 @@ def __init__(self, config_path: str = "config.yaml"):
                 "spaCy version <3.0 may have compatibility issues"
             ) # Warn if the spaCy version is too low

+        whitelisted_phrases = self.config.get("skills_whitelist", [])
+        if whitelisted_phrases and "entity_ruler" in self.nlp.pipe_names:
+            patterns = [{"label": "SKILL", "pattern": phrase} for phrase in whitelisted_phrases]
+            self.nlp.get_pipe("entity_ruler").add_patterns(patterns)
+
     def _add_entity_ruler(self):
         """Add an EntityRuler to the spaCy pipeline for section detection.

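The new `__init__` block registers every whitelisted skill phrase as an `entity_ruler` pattern labelled SKILL, which is what `_process_doc_tokens` and `extract_keywords` later rely on. A sketch under the assumption that an `entity_ruler` pipe is already present, as the guard in the diff requires; the phrases here are examples, not the real `skills_whitelist`:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("entity_ruler")  # the diff assumes this pipe already exists

skills_whitelist = ["machine learning", "sql", "project management"]  # example config values
patterns = [{"label": "SKILL", "pattern": phrase} for phrase in skills_whitelist]
nlp.get_pipe("entity_ruler").add_patterns(patterns)

doc = nlp("strong sql and project management background")
print([(ent.text, ent.label_) for ent in doc.ents])
# [('sql', 'SKILL'), ('project management', 'SKILL')]

By default, string patterns match exact token text; if case-insensitive matching is wanted, the ruler can instead be added with config={"phrase_matcher_attr": "LOWER"}.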
@@ -846,6 +841,8 @@ def _try_load_model(self, model_name):
             nlp = spacy.load(model_name, disable=["parser", "ner"])
             if "lemmatizer" not in nlp.pipe_names:
                 nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+            if "sentencizer" not in nlp.pipe_names:
+                nlp.add_pipe("sentencizer")
             return nlp
         except OSError:
             return None
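Since `_try_load_model` disables the parser, nothing sets sentence boundaries, and iterating `doc.sents` downstream would raise; the added sentencizer fills that gap cheaply. A small illustration, assuming `en_core_web_sm` is installed (substitute whatever model the config names):

import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")  # rule-based sentence boundaries, no parser needed

doc = nlp("First sentence. Second sentence.")
print([sent.text for sent in doc.sents])
# ['First sentence.', 'Second sentence.']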
@@ -884,16 +881,16 @@ def _load_and_configure_spacy_model(self):
         for attempt in range(retry_attempts + 1):
             nlp = self._try_load_model(model)
             if nlp:
+                logger.debug(f"Loaded spaCy pipeline: {nlp.pipe_names}")
                 return nlp
             logger.warning(
                 f"Model '{model}' not found. Attempt {attempt + 1}/{retry_attempts + 1}"
             )
-            if (
-                attempt < retry_attempts and model == model_name
-            ): # Only download the specified model
+            if attempt < retry_attempts and model == model_name:
                 if self._download_model(model):
                     nlp = self._try_load_model(model)
                     if nlp:
+                        logger.debug(f"Loaded spaCy pipeline: {nlp.pipe_names}")
                         return nlp
             time.sleep(2) # Add a delay of 2 seconds between retries

@@ -926,31 +923,41 @@ def _calculate_scores(self, dtm, feature_names, keyword_sets, job_descriptions):
             })
         return results

+    # --- Improved _create_tfidf_matrix function ---
     def _create_tfidf_matrix(self, texts, keyword_sets):
-        """Create and fit the TF-IDF vectorizer and transform the keyword sets.
-
-        Args:
-            texts: List of (preprocessed) job description texts.
-            keyword_sets: List of lists of keywords (one list per job description).
+        """Create and return a TF-IDF matrix for the job descriptions using pre-validated keyword sets.

-        Returns:
-            A tuple containing:
-                - The document-term matrix (sparse matrix).
-                - The list of feature names
+        The vectorizer is adjusted to treat each document as a pre-tokenized list,
+        preserving multi-word keywords without re-tokenization. As an extra safety
+        measure, we also filter each document's keyword set to remove invalid tokens.
         """
         max_features = self.config.get("tfidf_max_features", 10000)
         vectorizer = TfidfVectorizer(
             ngram_range=self.keyword_extractor.ngram_range,
             lowercase=False,
+            # Use identity functions so that input is treated as pre-tokenized lists.
             tokenizer=lambda x: x,
             preprocessor=lambda x: x,
             max_features=max_features,
             dtype=np.float32,
         )
-        dtm = vectorizer.fit_transform([" ".join(kw) for kw in keyword_sets])
-        if len(vectorizer.get_feature_names_out()) == max_features:
+        # Validate each document's keyword set to discard any term that includes a word of length <=1.
+        validated_sets = [
+            [kw for kw in kw_set if all(len(word) > 1 for word in kw.split())]
+            for kw_set in keyword_sets
+        ]
+        logger.debug(f"Validated keyword sets sample: {validated_sets[:2]}")
+        try:
+            dtm = vectorizer.fit_transform(validated_sets)
+        except ValueError as e:
+            logger.error(f"TF-IDF vectorization failed: {e}. Check keyword_sets content.")
+            return None, []
+        feature_names = vectorizer.get_feature_names_out()
+        if len(feature_names) == max_features:
             logger.warning(f"TF-IDF vocabulary reached the limit of {max_features} features")
-        return dtm, vectorizer.get_feature_names_out()
+        if not feature_names.size:
+            logger.warning("No features extracted by TF-IDF. Check input keyword sets.")
+        return dtm, feature_names

     def analyze_jobs(self, job_descriptions: Dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """Analyze job descriptions and extract keywords.
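With `lowercase=False` and identity `tokenizer`/`preprocessor`, scikit-learn's `TfidfVectorizer` accepts each document as an already-tokenized list, so multi-word keywords stay intact as single features instead of being split on whitespace. A standalone sketch with made-up keyword lists and `ngram_range=(1, 1)` (the project passes its configured range instead):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

keyword_sets = [
    ["machine learning", "sql", "data analysis"],
    ["project management", "sql"],
]

vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    lowercase=False,
    tokenizer=lambda x: x,       # documents are already lists of keywords
    preprocessor=lambda x: x,    # skip string preprocessing entirely
    dtype=np.float32,
)
dtm = vectorizer.fit_transform(keyword_sets)
print(vectorizer.get_feature_names_out())
# ['data analysis' 'machine learning' 'project management' 'sql']
print(dtm.shape)  # (2, 4)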