From 66b5336524b61f899b5a90217c2d659317f82a30 Mon Sep 17 00:00:00 2001 From: Saleem Latif Date: Fri, 3 May 2024 18:53:20 +0500 Subject: [PATCH] fix: Use percentile for filtering courses instead of tfidf values for uniform distribution. --- .../apps/ai_curation/tests/test_views.py | 3 +++ .../tests/utils/test_generate_curation_utils.py | 2 +- .../apps/ai_curation/utils/generate_curation_utils.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/enterprise_catalog/apps/ai_curation/tests/test_views.py b/enterprise_catalog/apps/ai_curation/tests/test_views.py index f3d0d74bd..0f2b0ffc2 100644 --- a/enterprise_catalog/apps/ai_curation/tests/test_views.py +++ b/enterprise_catalog/apps/ai_curation/tests/test_views.py @@ -41,6 +41,7 @@ def setUp(self): 'outcome': 'Learn data science with Python', 'program_titles': [], 'tf_idf_score': 0.3, + 'tf_idf_percentile': 0.3, }, { 'aggregation_key': 'course:MITx+20', @@ -53,6 +54,7 @@ def setUp(self): 'outcome': 'Learn Programming Basics', 'program_titles': ['Learn Programming Basics'], 'tf_idf_score': 0.6, + 'tf_idf_percentile': 0.6, }, ] self.partially_filtered_exec_ed_courses = [ @@ -67,6 +69,7 @@ def setUp(self): 'program_titles': ['Java for data science'], 'outcome': 'Learn data science with Java', 'tf_idf_score': 0.5, + 'tf_idf_percentile': 0.5, }, ] diff --git a/enterprise_catalog/apps/ai_curation/tests/utils/test_generate_curation_utils.py b/enterprise_catalog/apps/ai_curation/tests/utils/test_generate_curation_utils.py index dfabebcb1..b2df8d991 100644 --- a/enterprise_catalog/apps/ai_curation/tests/utils/test_generate_curation_utils.py +++ b/enterprise_catalog/apps/ai_curation/tests/utils/test_generate_curation_utils.py @@ -127,7 +127,7 @@ def test_apply_tfidf_filter(self, mock_get_query_keywords, mock_create): }, ] - filtered_courses, _ = apply_tfidf_filter('python data science', courses, tfidf_threshold=0.2) + filtered_courses, _ = apply_tfidf_filter('python data science', courses, 
tfidf_threshold=0.4) assert {c['title'] for c in filtered_courses} == {'Python for data science', 'Java for data science'} diff --git a/enterprise_catalog/apps/ai_curation/utils/generate_curation_utils.py b/enterprise_catalog/apps/ai_curation/utils/generate_curation_utils.py index 1286f3149..c2269fad8 100644 --- a/enterprise_catalog/apps/ai_curation/utils/generate_curation_utils.py +++ b/enterprise_catalog/apps/ai_curation/utils/generate_curation_utils.py @@ -3,6 +3,7 @@ """ from django.core.cache import cache from rest_framework import status +from scipy.stats import percentileofscore from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity @@ -130,7 +131,12 @@ def calculate_tfidf_score(query: str, courses: list): ) ])[0] - return sorted(courses, key=lambda item: item['tf_idf_score'], reverse=True) + sorted_by_score = sorted(courses, key=lambda item: item['tf_idf_score'], reverse=True) + scores = [course['tf_idf_score'] for course in sorted_by_score] + for course in sorted_by_score: + course['tf_idf_percentile'] = percentileofscore(scores, course['tf_idf_score']) / 100 + + return sorted_by_score def filter_by_threshold(courses: list, tfidf_threshold: float): @@ -144,7 +150,7 @@ def filter_by_threshold(courses: list, tfidf_threshold: float): Returns: list: List of courses filtered by the TF-IDF score """ - return [course for course in courses if course['tf_idf_score'] > tfidf_threshold] + return [course for course in courses if course['tf_idf_percentile'] > tfidf_threshold] def apply_tfidf_filter(query: str, courses: list, tfidf_threshold: float):