Skip to content

Commit

Permalink
Merge pull request #825 from openedx/saleem-latif/ENT-8324
Browse files: browse the repository at this point in the history
fix: Use percentile for filtering courses instead of tfidf values for uniform distribution
  • Loading branch information
saleem-latif authored May 6, 2024
2 parents 9f3deec + 66b5336 commit 2fe86a0
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 3 deletions.
3 changes: 3 additions & 0 deletions enterprise_catalog/apps/ai_curation/tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def setUp(self):
'outcome': 'Learn data science with Python',
'program_titles': [],
'tf_idf_score': 0.3,
'tf_idf_percentile': 0.3,
},
{
'aggregation_key': 'course:MITx+20',
Expand All @@ -53,6 +54,7 @@ def setUp(self):
'outcome': 'Learn Programming Basics',
'program_titles': ['Learn Programming Basics'],
'tf_idf_score': 0.6,
'tf_idf_percentile': 0.6,
},
]
self.partially_filtered_exec_ed_courses = [
Expand All @@ -67,6 +69,7 @@ def setUp(self):
'program_titles': ['Java for data science'],
'outcome': 'Learn data science with Java',
'tf_idf_score': 0.5,
'tf_idf_percentile': 0.5,
},
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def test_apply_tfidf_filter(self, mock_get_query_keywords, mock_create):
},
]

- filtered_courses, _ = apply_tfidf_filter('python data science', courses, tfidf_threshold=0.2)
+ filtered_courses, _ = apply_tfidf_filter('python data science', courses, tfidf_threshold=0.4)

assert {c['title'] for c in filtered_courses} == {'Python for data science', 'Java for data science'}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
from django.core.cache import cache
from rest_framework import status
from scipy.stats import percentileofscore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Expand Down Expand Up @@ -130,7 +131,12 @@ def calculate_tfidf_score(query: str, courses: list):
)
])[0]

- return sorted(courses, key=lambda item: item['tf_idf_score'], reverse=True)
+ sorted_by_score = sorted(courses, key=lambda item: item['tf_idf_score'], reverse=True)
+ scores = [course['tf_idf_score'] for course in sorted_by_score]
+ for course in sorted_by_score:
+     course['tf_idf_percentile'] = percentileofscore(scores, course['tf_idf_score']) / 100
+
+ return sorted_by_score


def filter_by_threshold(courses: list, tfidf_threshold: float):
Expand All @@ -144,7 +150,7 @@ def filter_by_threshold(courses: list, tfidf_threshold: float):
Returns:
list: List of courses filtered by the TF-IDF score
"""
- return [course for course in courses if course['tf_idf_score'] > tfidf_threshold]
+ return [course for course in courses if course['tf_idf_percentile'] > tfidf_threshold]


def apply_tfidf_filter(query: str, courses: list, tfidf_threshold: float):
Expand Down

0 comments on commit 2fe86a0

Please sign in to comment.