Skip to content

Commit

Permalink
feat: indexing bayesian average ranking for courses in algolia
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-sheehan-edx committed Jan 26, 2024
1 parent 25cb40c commit bcaa529
Show file tree
Hide file tree
Showing 8 changed files with 182 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def enterprise_catalog(self):

def get_permission_object(self):
"""
Retrieves the apporpriate object to use during edx-rbac's permission checks.
Retrieves the appropriate object to use during edx-rbac's permission checks.
This object is passed to the rule predicate(s).
"""
Expand Down
7 changes: 7 additions & 0 deletions enterprise_catalog/apps/api_client/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@
DISCOVERY_COURSE_REVIEWS_ENDPOINT = urljoin(settings.DISCOVERY_SERVICE_API_URL, 'course_review/')
DISCOVERY_OFFSET_SIZE = 200
DISCOVERY_CATALOG_QUERY_CACHE_KEY_TPL = 'catalog_query:{id}'
# Cache key under which the computed global average course review score is stored.
DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY = 'average_course_review'

# Weight (a pseudo-count of reviews) given to the global average when computing a
# course's Bayesian average rating — see algolia_utils.get_course_bayesian_average.
COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER = 15

# Fallback global average used when no cached value exists.
# As of 1/26/24 this is calculated from Snowflake:
# SELECT SUM(REVIEWS_COUNT * AVG_COURSE_RATING)/SUM(REVIEWS_COUNT) FROM enterprise.course_reviews
COURSE_REVIEW_BASE_AVG_REVIEW_SCORE = 4.5

# Enterprise API Client Constants
ENTERPRISE_API_URL = urljoin(settings.LMS_BASE_URL, '/enterprise/api/v1/')
Expand Down
2 changes: 2 additions & 0 deletions enterprise_catalog/apps/api_client/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import requests
from celery.exceptions import SoftTimeLimitExceeded
from django.conf import settings
from django.core.cache import cache

from .base_oauth import BaseOAuthClient
from .constants import (
DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
DISCOVERY_COURSE_REVIEWS_ENDPOINT,
DISCOVERY_COURSES_ENDPOINT,
DISCOVERY_OFFSET_SIZE,
Expand Down
76 changes: 75 additions & 1 deletion enterprise_catalog/apps/catalog/algolia_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,18 @@
import time

from dateutil import parser
from django.conf import settings
from django.core.cache import cache
from django.db.models import Q
from django.utils.translation import gettext as _

from enterprise_catalog.apps.api.v1.utils import is_course_run_active
from enterprise_catalog.apps.api_client.algolia import AlgoliaSearchClient
from enterprise_catalog.apps.api_client.constants import (
COURSE_REVIEW_BASE_AVG_REVIEW_SCORE,
COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER,
DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
)
from enterprise_catalog.apps.catalog.constants import (
COURSE,
EXEC_ED_2U_COURSE_TYPE,
Expand All @@ -17,7 +25,7 @@
PROGRAM_TYPES_MAP,
)
from enterprise_catalog.apps.catalog.models import ContentMetadata
from enterprise_catalog.apps.catalog.utils import localized_utcnow
from enterprise_catalog.apps.catalog.utils import localized_utcnow, batch_by_pk


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -85,6 +93,7 @@
'normalized_metadata',
'reviews_count',
'avg_course_rating',
'course_bayesian_average',
]

# default configuration for the index
Expand Down Expand Up @@ -137,6 +146,7 @@
'customRanking': [
'asc(visible_via_association)',
'asc(created)',
'desc(course_bayesian_average)',
'desc(recent_enrollment_count)',
],
}
Expand Down Expand Up @@ -331,6 +341,69 @@ def get_algolia_object_id(content_type, uuid):
return None


def set_global_course_review_avg():
    """
    Calculate the review-count-weighted global average course rating across all
    course-type ContentMetadata records and store it in the Django cache under
    ``DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY``.

    Records missing either ``avg_course_rating`` or ``reviews_count`` (or with
    falsy values for them) are skipped. If no reviews are found at all, the
    cache is left untouched and a warning is logged.
    """
    rolling_rating_sum = 0.0
    total_number_reviews = 0
    course_only_filter = Q(content_type='course')
    # only courses have course reviews
    for items_batch in batch_by_pk(ContentMetadata, extra_filter=course_only_filter):
        for item in items_batch:
            if not item.json_metadata.get('avg_course_rating') or not item.json_metadata.get('reviews_count'):
                continue

            # Bug fix: the original code cast the rating to ``int`` (truncating
            # e.g. 4.7 -> 4) while casting the count to ``float`` — the casts
            # were swapped. The rating is the fractional quantity; keep it a float.
            reviews_count = float(item.json_metadata.get('reviews_count'))
            avg_rating = float(item.json_metadata.get('avg_course_rating'))
            logger.info(
                "set_global_course_review_avg " +
                f"found {reviews_count} course reviews for course: {item.content_key} with avg score of {avg_rating}"
            )
            # Weight each course's average by its number of reviews.
            rolling_rating_sum += avg_rating * reviews_count
            total_number_reviews += reviews_count

    if rolling_rating_sum == 0 or total_number_reviews == 0:
        logger.warning("set_global_course_review_avg came up with no ratings, somehow.")
        return

    total_average_course_rating = rolling_rating_sum / total_number_reviews

    cache.set(
        DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
        total_average_course_rating,
        settings.DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_TIMEOUT
    )


def get_global_course_review_avg():
    """
    Return the cached global average course review score.

    Falls back to the precomputed baseline
    ``COURSE_REVIEW_BASE_AVG_REVIEW_SCORE`` when no value is present in the cache.
    """
    cached_average = cache.get(DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY)
    if cached_average is not None:
        return cached_average
    return COURSE_REVIEW_BASE_AVG_REVIEW_SCORE


def get_course_bayesian_average(course):
    """
    Compute a course's Bayesian average review score, smoothing the course's own
    rating toward the global average with a fixed confidence weight.
    https://www.algolia.com/doc/guides/managing-results/must-do/custom-ranking/how-to/bayesian-average/

    Returns 0 when the course has no rating or no review count.
    """
    global_avg = get_global_course_review_avg()
    rating = course.get('avg_course_rating')
    num_ratings = course.get('reviews_count')
    if rating is None or num_ratings is None:
        return 0
    weighted_sum = (rating * num_ratings) + (global_avg * COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER)
    return weighted_sum / (num_ratings + COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER)


def get_course_language(course):
"""
Gets the human-readable language name associated with the advertised course run. Used for
Expand Down Expand Up @@ -1204,6 +1277,7 @@ def _algolia_object_from_product(product, algolia_fields):
'learning_type_v2': get_learning_type_v2(searchable_product),
'reviews_count': get_reviews_count(searchable_product),
'avg_course_rating': get_avg_course_rating(searchable_product),
'course_bayesian_average': get_course_bayesian_average(searchable_product),
})
elif searchable_product.get('content_type') == PROGRAM:
searchable_product.update({
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import logging

from django.core.management.base import BaseCommand

from enterprise_catalog.apps.catalog.algolia_utils import set_global_course_review_avg


logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """
    Management command that recomputes the global average course rating from
    ContentMetadata records and stores it in the Django cache.
    """
    help = (
        'Fetch ContentMetadata records, read the course review values and save the average to py-cache.'
    )

    def handle(self, *args, **options):
        """
        Fetch ContentMetadata records, read the course review values and save the average to py-cache.
        """
        # Bug fix: the log line previously referenced an unrelated command
        # ("compare_catalog_queries_to_filters") — a copy/paste leftover.
        logger.info("set_global_average_course_rating_value starting...")
        set_global_course_review_avg()
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from unittest import mock
from django.conf import settings
from django.core.management import call_command
from django.test import TestCase

from enterprise_catalog.apps.api_client.constants import DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY
from enterprise_catalog.apps.catalog.tests.factories import ContentMetadataFactory


class TestSetGlobalAverageCourseRatingValue(TestCase):
    """
    Tests for the ``set_global_average_course_rating_value`` management command,
    which averages course review scores and writes the result to the cache.
    """
    command_name = 'set_global_average_course_rating_value'

    @mock.patch('enterprise_catalog.apps.catalog.algolia_utils.cache')
    def test_command_averages_course_reviews(
        self, mock_cache,
    ):
        """
        Verify that the command sifts over content metadata and calculates the
        average review score, setting the value to py-cache.
        """
        # Two reviewed courses plus one record with no review data — the latter
        # should be ignored by the command.
        ContentMetadataFactory(
            content_type='course',
            json_metadata={'avg_course_rating': 5, 'reviews_count': 20}
        )
        ContentMetadataFactory(
            content_type='course',
            json_metadata={'avg_course_rating': 4, 'reviews_count': 10}
        )
        ContentMetadataFactory(json_metadata={})
        call_command(self.command_name)
        # Review-count-weighted average of the two reviewed courses.
        expected_total_average = ((5 * 20) + (4 * 10)) / 30

        mock_cache.set.assert_called_with(
            DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
            expected_total_average,
            settings.DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_TIMEOUT,
        )

    @mock.patch('enterprise_catalog.apps.catalog.algolia_utils.cache')
    def test_command_handles_no_course_reviews(
        self, mock_cache,
    ):
        """
        Verify that the job will not blow up if there are no reviews to average.
        """
        # With no ContentMetadata at all, the command should log and bail
        # without ever touching the cache.
        call_command(self.command_name)
        mock_cache.set.assert_not_called()
24 changes: 24 additions & 0 deletions enterprise_catalog/apps/catalog/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from urllib.parse import urljoin

from django.conf import settings
from django.db.models import Q
from edx_rbac.utils import feature_roles_from_jwt
from edx_rest_framework_extensions.auth.jwt.authentication import (
get_decoded_jwt_from_auth,
Expand Down Expand Up @@ -114,3 +115,26 @@ def enterprise_proxy_login_url(slug, next_url=None):
if next_url:
url += f'&next={next_url}'
return url


def batch_by_pk(ModelClass, extra_filter=Q(), batch_size=10000):
    """
    Yield querysets of up to ``batch_size`` rows of ``ModelClass``, ordered by pk.

    Using limit/offset does a lot of table scanning to reach higher offsets;
    this scanning can be slow on very large tables. Ordering by pk and using the
    last-seen pk as a pivot (``pk__gt``) utilizes the primary-key index, which
    is faster than scanning to reach an offset.

    Example usage:
        course_only_filter = Q(content_type='course')
        for items_batch in batch_by_pk(ContentMetadata, extra_filter=course_only_filter):
            for item in items_batch:
                ...

    NOTE(review): rows inserted/deleted between batches may be skipped or seen
    once depending on their pk — acceptable for bulk maintenance jobs like this.
    """
    qs = ModelClass.objects.filter(extra_filter).order_by('pk')[:batch_size]
    while qs.exists():
        yield qs
        # qs.last() doesn't work here because we've already sliced
        # loop through so we eventually grab the last one
        for item in qs:
            start_pk = item.pk
        qs = ModelClass.objects.filter(pk__gt=start_pk).filter(extra_filter).order_by('pk')[:batch_size]
5 changes: 5 additions & 0 deletions enterprise_catalog/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,9 +380,14 @@

# How long we keep API Client data in cache. (seconds)
ONE_HOUR = 60 * 60
ONE_DAY = ONE_HOUR * 24
# NOTE: a "month" here is approximated as 31 days — these are cache timeouts,
# not calendar math, so the imprecision is harmless.
ONE_MONTH = ONE_DAY * 31
ONE_YEAR = ONE_MONTH * 12
ONE_DECADE = ONE_YEAR * 10
ENTERPRISE_CUSTOMER_CACHE_TIMEOUT = ONE_HOUR
DISCOVERY_CATALOG_QUERY_CACHE_TIMEOUT = ONE_HOUR
DISCOVERY_COURSE_DATA_CACHE_TIMEOUT = ONE_HOUR
# Effectively "never expire" — presumably refreshed by the
# set_global_average_course_rating_value management command; verify that the
# command is scheduled to run periodically.
DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_TIMEOUT = ONE_DECADE

# URLs
LMS_BASE_URL = os.environ.get('LMS_BASE_URL', '')
Expand Down

0 comments on commit bcaa529

Please sign in to comment.