diff --git a/enterprise_catalog/apps/api/v1/views/enterprise_catalog_get_content_metadata.py b/enterprise_catalog/apps/api/v1/views/enterprise_catalog_get_content_metadata.py
index 533499ec5..5eb516bf2 100644
--- a/enterprise_catalog/apps/api/v1/views/enterprise_catalog_get_content_metadata.py
+++ b/enterprise_catalog/apps/api/v1/views/enterprise_catalog_get_content_metadata.py
@@ -38,7 +38,7 @@ def enterprise_catalog(self):
 
     def get_permission_object(self):
         """
-        Retrieves the apporpriate object to use during edx-rbac's permission checks.
+        Retrieves the appropriate object to use during edx-rbac's permission checks.
 
         This object is passed to the rule predicate(s).
         """
diff --git a/enterprise_catalog/apps/api_client/constants.py b/enterprise_catalog/apps/api_client/constants.py
index 198ac176d..d979cfe4d 100644
--- a/enterprise_catalog/apps/api_client/constants.py
+++ b/enterprise_catalog/apps/api_client/constants.py
@@ -13,6 +13,13 @@
 DISCOVERY_COURSE_REVIEWS_ENDPOINT = urljoin(settings.DISCOVERY_SERVICE_API_URL, 'course_review/')
 DISCOVERY_OFFSET_SIZE = 200
 DISCOVERY_CATALOG_QUERY_CACHE_KEY_TPL = 'catalog_query:{id}'
+DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY = 'average_course_review'
+
+COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER = 15
+
+# As of 1/26/24 this is calculated from Snowflake:
+# SELECT SUM(REVIEWS_COUNT * AVG_COURSE_RATING)/SUM(REVIEWS_COUNT) FROM enterprise.course_reviews
+COURSE_REVIEW_BASE_AVG_REVIEW_SCORE = 4.5
 
 # Enterprise API Client Constants
 ENTERPRISE_API_URL = urljoin(settings.LMS_BASE_URL, '/enterprise/api/v1/')
diff --git a/enterprise_catalog/apps/api_client/discovery.py b/enterprise_catalog/apps/api_client/discovery.py
index 8b00279b8..609ae11d8 100644
--- a/enterprise_catalog/apps/api_client/discovery.py
+++ b/enterprise_catalog/apps/api_client/discovery.py
@@ -7,9 +7,11 @@
 import requests
 from celery.exceptions import SoftTimeLimitExceeded
 from django.conf import settings
+from django.core.cache import cache
 
 from .base_oauth import BaseOAuthClient
 from .constants import (
+    DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
     DISCOVERY_COURSE_REVIEWS_ENDPOINT,
     DISCOVERY_COURSES_ENDPOINT,
     DISCOVERY_OFFSET_SIZE,
diff --git a/enterprise_catalog/apps/catalog/algolia_utils.py b/enterprise_catalog/apps/catalog/algolia_utils.py
index 3f511f484..550924dfb 100644
--- a/enterprise_catalog/apps/catalog/algolia_utils.py
+++ b/enterprise_catalog/apps/catalog/algolia_utils.py
@@ -4,10 +4,18 @@
 import time
 
 from dateutil import parser
+from django.conf import settings
+from django.core.cache import cache
+from django.db.models import Q
 from django.utils.translation import gettext as _
 
 from enterprise_catalog.apps.api.v1.utils import is_course_run_active
 from enterprise_catalog.apps.api_client.algolia import AlgoliaSearchClient
+from enterprise_catalog.apps.api_client.constants import (
+    COURSE_REVIEW_BASE_AVG_REVIEW_SCORE,
+    COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER,
+    DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
+)
 from enterprise_catalog.apps.catalog.constants import (
     COURSE,
     EXEC_ED_2U_COURSE_TYPE,
@@ -17,7 +25,7 @@
     PROGRAM_TYPES_MAP,
 )
 from enterprise_catalog.apps.catalog.models import ContentMetadata
-from enterprise_catalog.apps.catalog.utils import localized_utcnow
+from enterprise_catalog.apps.catalog.utils import localized_utcnow, batch_by_pk
 
 
 logger = logging.getLogger(__name__)
@@ -85,6 +93,7 @@
     'normalized_metadata',
     'reviews_count',
     'avg_course_rating',
+    'course_bayesian_average',
 ]
 
 # default configuration for the index
@@ -137,6 +146,7 @@
     'customRanking': [
         'asc(visible_via_association)',
         'asc(created)',
+        'desc(course_bayesian_average)',
         'desc(recent_enrollment_count)',
     ],
 }
@@ -331,6 +341,69 @@ def get_algolia_object_id(content_type, uuid):
     return None
 
 
+def set_global_course_review_avg():
+    """
+    Retrieve all course reviews from the ContentMetadata records in the database, calculate the average review value,
+    and store the result in py-cache.
+    """
+    rolling_rating_sum = 0.0
+    total_number_reviews = 0
+    course_only_filter = Q(content_type='course')
+    # Only courses have course reviews.
+    for items_batch in batch_by_pk(ContentMetadata, extra_filter=course_only_filter):
+        for item in items_batch:
+            if not item.json_metadata.get('avg_course_rating') or not item.json_metadata.get('reviews_count'):
+                continue
+
+            reviews_count = float(item.json_metadata.get('reviews_count'))
+            avg_rating = float(item.json_metadata.get('avg_course_rating'))
+            logger.info(
+                "set_global_course_review_avg " +
+                f"found {reviews_count} course reviews for course: {item.content_key} with avg score of {avg_rating}"
+            )
+            rolling_rating_sum += avg_rating * reviews_count
+            total_number_reviews += reviews_count
+
+    if rolling_rating_sum == 0 or total_number_reviews == 0:
+        logger.warning("set_global_course_review_avg came up with no ratings, somehow.")
+        return
+
+    total_average_course_rating = rolling_rating_sum / total_number_reviews
+
+    cache.set(
+        DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
+        total_average_course_rating,
+        settings.DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_TIMEOUT
+    )
+
+
+def get_global_course_review_avg():
+    """
+    Fetch the calculated global course review average from py-cache.
+    """
+    cache_key = DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY
+    total_average_rating = cache.get(cache_key)
+    if total_average_rating is None:
+        return COURSE_REVIEW_BASE_AVG_REVIEW_SCORE
+    return total_average_rating
+
+
+def get_course_bayesian_average(course):
+    """
+    Use the global average review value to calculate an individual course's Bayesian average review value.
+    https://www.algolia.com/doc/guides/managing-results/must-do/custom-ranking/how-to/bayesian-average/
+    """
+    total_avg = get_global_course_review_avg()
+    avg_review = course.get('avg_course_rating')
+    ratings_count = course.get('reviews_count')
+    if avg_review is not None and ratings_count is not None:
+        return (
+            (avg_review * ratings_count) + (total_avg * COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER)
+        ) / (ratings_count + COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER)
+    else:
+        return 0
+
+
 def get_course_language(course):
     """
     Gets the human-readable language name associated with the advertised course run. Used for
@@ -1204,6 +1277,7 @@ def _algolia_object_from_product(product, algolia_fields):
             'learning_type_v2': get_learning_type_v2(searchable_product),
             'reviews_count': get_reviews_count(searchable_product),
             'avg_course_rating': get_avg_course_rating(searchable_product),
+            'course_bayesian_average': get_course_bayesian_average(searchable_product),
         })
     elif searchable_product.get('content_type') == PROGRAM:
         searchable_product.update({
diff --git a/enterprise_catalog/apps/catalog/management/commands/set_global_average_course_rating_value.py b/enterprise_catalog/apps/catalog/management/commands/set_global_average_course_rating_value.py
new file mode 100644
index 000000000..8699257ea
--- /dev/null
+++ b/enterprise_catalog/apps/catalog/management/commands/set_global_average_course_rating_value.py
@@ -0,0 +1,21 @@
+import logging
+
+from django.core.management.base import BaseCommand
+
+from enterprise_catalog.apps.catalog.algolia_utils import set_global_course_review_avg
+
+
+logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    help = (
+        'Fetch ContentMetadata records, read the course review values and save the average to py-cache.'
+    )
+
+    def handle(self, *args, **options):
+        """
+        Fetch ContentMetadata records, read the course review values and save the average to py-cache.
+        """
+        logger.info("set_global_average_course_rating_value starting...")
+        set_global_course_review_avg()
diff --git a/enterprise_catalog/apps/catalog/management/commands/tests/test_set_global_average_course_rating_value.py b/enterprise_catalog/apps/catalog/management/commands/tests/test_set_global_average_course_rating_value.py
new file mode 100644
index 000000000..fcedd1a62
--- /dev/null
+++ b/enterprise_catalog/apps/catalog/management/commands/tests/test_set_global_average_course_rating_value.py
@@ -0,0 +1,47 @@
+from unittest import mock
+from django.conf import settings
+from django.core.management import call_command
+from django.test import TestCase
+
+from enterprise_catalog.apps.api_client.constants import DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY
+from enterprise_catalog.apps.catalog.tests.factories import ContentMetadataFactory
+
+
+class TestSetGlobalAverageCourseRatingValue(TestCase):
+    command_name = 'set_global_average_course_rating_value'
+
+    @mock.patch('enterprise_catalog.apps.catalog.algolia_utils.cache')
+    def test_command_averages_course_reviews(
+        self, mock_cache,
+    ):
+        """
+        Verify that the command sifts through content metadata and calculates the
+        average review score, setting the value to py-cache.
+        """
+        ContentMetadataFactory(
+            content_type='course',
+            json_metadata={'avg_course_rating': 5, 'reviews_count': 20}
+        )
+        ContentMetadataFactory(
+            content_type='course',
+            json_metadata={'avg_course_rating': 4, 'reviews_count': 10}
+        )
+        ContentMetadataFactory(json_metadata={})
+        call_command(self.command_name)
+        expected_total_average = ((5 * 20) + (4 * 10)) / 30
+
+        mock_cache.set.assert_called_with(
+            DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
+            expected_total_average,
+            settings.DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_TIMEOUT,
+        )
+
+    @mock.patch('enterprise_catalog.apps.catalog.algolia_utils.cache')
+    def test_command_handles_no_course_reviews(
+        self, mock_cache,
+    ):
+        """
+        Verify that the job will not blow up if there are no reviews to average.
+ """ + call_command(self.command_name) + mock_cache.set.assert_not_called() diff --git a/enterprise_catalog/apps/catalog/utils.py b/enterprise_catalog/apps/catalog/utils.py index 9b142f8c0..8eced9426 100644 --- a/enterprise_catalog/apps/catalog/utils.py +++ b/enterprise_catalog/apps/catalog/utils.py @@ -8,6 +8,7 @@ from urllib.parse import urljoin from django.conf import settings +from django.db.models import Q from edx_rbac.utils import feature_roles_from_jwt from edx_rest_framework_extensions.auth.jwt.authentication import ( get_decoded_jwt_from_auth, @@ -114,3 +115,26 @@ def enterprise_proxy_login_url(slug, next_url=None): if next_url: url += f'&next={next_url}' return url + + +def batch_by_pk(ModelClass, extra_filter=Q(), batch_size=10000): + """ + yield per batch efficiently + using limit/offset does a lot of table scanning to reach higher offsets + this scanning can be slow on very large tables + if you order by pk, you can use the pk as a pivot rather than offset + this utilizes the index, which is faster than scanning to reach offset + Example usage: + course_only_filter = Q(content_type='course') + for items_batch in batch_by_pk(ContentMetadata, extra_filter=course_only_filter): + for item in items_batch: + ... + """ + qs = ModelClass.objects.filter(extra_filter).order_by('pk')[:batch_size] + while qs.exists(): + yield qs + # qs.last() doesn't work here because we've already sliced + # loop through so we eventually grab the last one + for item in qs: + start_pk = item.pk + qs = ModelClass.objects.filter(pk__gt=start_pk).filter(extra_filter).order_by('pk')[:batch_size] diff --git a/enterprise_catalog/settings/base.py b/enterprise_catalog/settings/base.py index 55aef03fe..4a9bd5a2d 100644 --- a/enterprise_catalog/settings/base.py +++ b/enterprise_catalog/settings/base.py @@ -380,9 +380,14 @@ # How long we keep API Client data in cache. (seconds) ONE_HOUR = 60 * 60 +ONE_DAY = ONE_HOUR * 24 +ONE_MONTH = ONE_DAY * 31 +ONE_YEAR = ONE_MONTH * 12 +ONE_DECADE = ONE_YEAR * 10 ENTERPRISE_CUSTOMER_CACHE_TIMEOUT = ONE_HOUR DISCOVERY_CATALOG_QUERY_CACHE_TIMEOUT = ONE_HOUR DISCOVERY_COURSE_DATA_CACHE_TIMEOUT = ONE_HOUR +DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_TIMEOUT = ONE_DECADE # URLs LMS_BASE_URL = os.environ.get('LMS_BASE_URL', '')