Skip to content

Commit

Permalink
feat: indexing bayesian average ranking for courses in algolia
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-sheehan-edx committed Jan 26, 2024
1 parent 25cb40c commit bcaa529
Show file tree
Hide file tree
Showing 8 changed files with 182 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def enterprise_catalog(self):

def get_permission_object(self):
"""
Retrieves the apporpriate object to use during edx-rbac's permission checks.
Retrieves the appropriate object to use during edx-rbac's permission checks.
This object is passed to the rule predicate(s).
"""
Expand Down
7 changes: 7 additions & 0 deletions enterprise_catalog/apps/api_client/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@
DISCOVERY_COURSE_REVIEWS_ENDPOINT = urljoin(settings.DISCOVERY_SERVICE_API_URL, 'course_review/')
DISCOVERY_OFFSET_SIZE = 200
DISCOVERY_CATALOG_QUERY_CACHE_KEY_TPL = 'catalog_query:{id}'
# Cache key under which the computed global average course review score is stored.
DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY = 'average_course_review'

# Weight (a pseudo-count of reviews) given to the global average when computing a
# course's Bayesian average rating — see algolia_utils.get_course_bayesian_average.
COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER = 15

# Fallback global average used when no cached value exists.
# As of 1/26/24 this is calculated from Snowflake:
# SELECT SUM(REVIEWS_COUNT * AVG_COURSE_RATING)/SUM(REVIEWS_COUNT) FROM enterprise.course_reviews
COURSE_REVIEW_BASE_AVG_REVIEW_SCORE = 4.5

# Enterprise API Client Constants
ENTERPRISE_API_URL = urljoin(settings.LMS_BASE_URL, '/enterprise/api/v1/')
Expand Down
2 changes: 2 additions & 0 deletions enterprise_catalog/apps/api_client/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import requests
from celery.exceptions import SoftTimeLimitExceeded
from django.conf import settings
from django.core.cache import cache

from .base_oauth import BaseOAuthClient
from .constants import (
DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
DISCOVERY_COURSE_REVIEWS_ENDPOINT,
DISCOVERY_COURSES_ENDPOINT,
DISCOVERY_OFFSET_SIZE,
Expand Down
76 changes: 75 additions & 1 deletion enterprise_catalog/apps/catalog/algolia_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,18 @@
import time

from dateutil import parser
from django.conf import settings
from django.core.cache import cache
from django.db.models import Q
from django.utils.translation import gettext as _

from enterprise_catalog.apps.api.v1.utils import is_course_run_active
from enterprise_catalog.apps.api_client.algolia import AlgoliaSearchClient
from enterprise_catalog.apps.api_client.constants import (
COURSE_REVIEW_BASE_AVG_REVIEW_SCORE,
COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER,
DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
)
from enterprise_catalog.apps.catalog.constants import (
COURSE,
EXEC_ED_2U_COURSE_TYPE,
Expand All @@ -17,7 +25,7 @@
PROGRAM_TYPES_MAP,
)
from enterprise_catalog.apps.catalog.models import ContentMetadata
from enterprise_catalog.apps.catalog.utils import localized_utcnow
from enterprise_catalog.apps.catalog.utils import localized_utcnow, batch_by_pk


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -85,6 +93,7 @@
'normalized_metadata',
'reviews_count',
'avg_course_rating',
'course_bayesian_average',
]

# default configuration for the index
Expand Down Expand Up @@ -137,6 +146,7 @@
'customRanking': [
'asc(visible_via_association)',
'asc(created)',
'desc(course_bayesian_average)',
'desc(recent_enrollment_count)',
],
}
Expand Down Expand Up @@ -331,6 +341,69 @@ def get_algolia_object_id(content_type, uuid):
return None


def set_global_course_review_avg():
    """
    Calculate the review-count-weighted global average course rating across all
    course-type ContentMetadata records and store it in the Django cache under
    ``DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY``.

    Records missing either ``avg_course_rating`` or ``reviews_count`` (or with
    falsy values for them) are skipped. If no reviews are found at all, the
    cache is left untouched and a warning is logged.
    """
    rolling_rating_sum = 0.0
    total_number_reviews = 0
    course_only_filter = Q(content_type='course')
    # only courses have course reviews
    for items_batch in batch_by_pk(ContentMetadata, extra_filter=course_only_filter):
        for item in items_batch:
            if not item.json_metadata.get('avg_course_rating') or not item.json_metadata.get('reviews_count'):
                continue

            # Bug fix: the original code cast the rating to ``int`` (truncating
            # e.g. 4.7 -> 4) while casting the count to ``float`` — the casts
            # were swapped. The rating is the fractional quantity; keep it a float.
            reviews_count = float(item.json_metadata.get('reviews_count'))
            avg_rating = float(item.json_metadata.get('avg_course_rating'))
            logger.info(
                "set_global_course_review_avg " +
                f"found {reviews_count} course reviews for course: {item.content_key} with avg score of {avg_rating}"
            )
            # Weight each course's average by its number of reviews.
            rolling_rating_sum += avg_rating * reviews_count
            total_number_reviews += reviews_count

    if rolling_rating_sum == 0 or total_number_reviews == 0:
        logger.warning("set_global_course_review_avg came up with no ratings, somehow.")
        return

    total_average_course_rating = rolling_rating_sum / total_number_reviews

    cache.set(
        DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
        total_average_course_rating,
        settings.DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_TIMEOUT
    )


def get_global_course_review_avg():
    """
    Return the cached global average course review score.

    Falls back to the precomputed baseline
    ``COURSE_REVIEW_BASE_AVG_REVIEW_SCORE`` when no value is present in the cache.
    """
    cached_average = cache.get(DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY)
    if cached_average is not None:
        return cached_average
    return COURSE_REVIEW_BASE_AVG_REVIEW_SCORE


def get_course_bayesian_average(course):
    """
    Compute a course's Bayesian average review score, smoothing the course's own
    rating toward the global average with a fixed confidence weight.
    https://www.algolia.com/doc/guides/managing-results/must-do/custom-ranking/how-to/bayesian-average/

    Returns 0 when the course has no rating or no review count.
    """
    global_avg = get_global_course_review_avg()
    rating = course.get('avg_course_rating')
    num_ratings = course.get('reviews_count')
    if rating is None or num_ratings is None:
        return 0
    weighted_sum = (rating * num_ratings) + (global_avg * COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER)
    return weighted_sum / (num_ratings + COURSE_REVIEW_BAYESIAN_CONFIDENCE_NUMBER)


def get_course_language(course):
"""
Gets the human-readable language name associated with the advertised course run. Used for
Expand Down Expand Up @@ -1204,6 +1277,7 @@ def _algolia_object_from_product(product, algolia_fields):
'learning_type_v2': get_learning_type_v2(searchable_product),
'reviews_count': get_reviews_count(searchable_product),
'avg_course_rating': get_avg_course_rating(searchable_product),
'course_bayesian_average': get_course_bayesian_average(searchable_product),
})
elif searchable_product.get('content_type') == PROGRAM:
searchable_product.update({
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import logging

from django.core.management.base import BaseCommand

from enterprise_catalog.apps.catalog.algolia_utils import set_global_course_review_avg


logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """
    Management command that recomputes the global average course rating from
    ContentMetadata records and stores it in the Django cache.
    """
    help = (
        'Fetch ContentMetadata records, read the course review values and save the average to py-cache.'
    )

    def handle(self, *args, **options):
        """
        Fetch ContentMetadata records, read the course review values and save the average to py-cache.
        """
        # Bug fix: the log line previously referenced an unrelated command
        # ("compare_catalog_queries_to_filters") — a copy/paste leftover.
        logger.info("set_global_average_course_rating_value starting...")
        set_global_course_review_avg()
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from unittest import mock
from django.conf import settings
from django.core.management import call_command
from django.test import TestCase

from enterprise_catalog.apps.api_client.constants import DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY
from enterprise_catalog.apps.catalog.tests.factories import ContentMetadataFactory


class TestSetGlobalAverageCourseRatingValue(TestCase):
    """
    Tests for the ``set_global_average_course_rating_value`` management command,
    which averages course review scores and writes the result to the cache.
    """
    command_name = 'set_global_average_course_rating_value'

    @mock.patch('enterprise_catalog.apps.catalog.algolia_utils.cache')
    def test_command_averages_course_reviews(
        self, mock_cache,
    ):
        """
        Verify that the command sifts over content metadata and calculates the
        average review score, setting the value to py-cache.
        """
        # Two reviewed courses plus one record with no review data — the latter
        # should be ignored by the command.
        ContentMetadataFactory(
            content_type='course',
            json_metadata={'avg_course_rating': 5, 'reviews_count': 20}
        )
        ContentMetadataFactory(
            content_type='course',
            json_metadata={'avg_course_rating': 4, 'reviews_count': 10}
        )
        ContentMetadataFactory(json_metadata={})
        call_command(self.command_name)
        # Review-count-weighted average of the two reviewed courses.
        expected_total_average = ((5 * 20) + (4 * 10)) / 30

        mock_cache.set.assert_called_with(
            DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_KEY,
            expected_total_average,
            settings.DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_TIMEOUT,
        )

    @mock.patch('enterprise_catalog.apps.catalog.algolia_utils.cache')
    def test_command_handles_no_course_reviews(
        self, mock_cache,
    ):
        """
        Verify that the job will not blow up if there are no reviews to average.
        """
        # With no ContentMetadata at all, the command should log and bail
        # without ever touching the cache.
        call_command(self.command_name)
        mock_cache.set.assert_not_called()
24 changes: 24 additions & 0 deletions enterprise_catalog/apps/catalog/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from urllib.parse import urljoin

from django.conf import settings
from django.db.models import Q
from edx_rbac.utils import feature_roles_from_jwt
from edx_rest_framework_extensions.auth.jwt.authentication import (
get_decoded_jwt_from_auth,
Expand Down Expand Up @@ -114,3 +115,26 @@ def enterprise_proxy_login_url(slug, next_url=None):
if next_url:
url += f'&next={next_url}'
return url


def batch_by_pk(ModelClass, extra_filter=Q(), batch_size=10000):
    """
    Yield querysets of up to ``batch_size`` rows of ``ModelClass``, ordered by pk.

    Using limit/offset does a lot of table scanning to reach higher offsets;
    this scanning can be slow on very large tables. Ordering by pk and using the
    last-seen pk as a pivot (``pk__gt``) utilizes the primary-key index, which
    is faster than scanning to reach an offset.

    Example usage:
        course_only_filter = Q(content_type='course')
        for items_batch in batch_by_pk(ContentMetadata, extra_filter=course_only_filter):
            for item in items_batch:
                ...

    NOTE(review): rows inserted/deleted between batches may be skipped or seen
    once depending on their pk — acceptable for bulk maintenance jobs like this.
    """
    qs = ModelClass.objects.filter(extra_filter).order_by('pk')[:batch_size]
    while qs.exists():
        yield qs
        # qs.last() doesn't work here because we've already sliced
        # loop through so we eventually grab the last one
        for item in qs:
            start_pk = item.pk
        qs = ModelClass.objects.filter(pk__gt=start_pk).filter(extra_filter).order_by('pk')[:batch_size]
5 changes: 5 additions & 0 deletions enterprise_catalog/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,9 +380,14 @@

# How long we keep API Client data in cache. (seconds)
ONE_HOUR = 60 * 60
ONE_DAY = ONE_HOUR * 24
# NOTE: a "month" here is approximated as 31 days — these are cache timeouts,
# not calendar math, so the imprecision is harmless.
ONE_MONTH = ONE_DAY * 31
ONE_YEAR = ONE_MONTH * 12
ONE_DECADE = ONE_YEAR * 10
ENTERPRISE_CUSTOMER_CACHE_TIMEOUT = ONE_HOUR
DISCOVERY_CATALOG_QUERY_CACHE_TIMEOUT = ONE_HOUR
DISCOVERY_COURSE_DATA_CACHE_TIMEOUT = ONE_HOUR
# Effectively "never expire" — presumably refreshed by the
# set_global_average_course_rating_value management command; verify that the
# command is scheduled to run periodically.
DISCOVERY_AVERAGE_COURSE_REVIEW_CACHE_TIMEOUT = ONE_DECADE

# URLs
LMS_BASE_URL = os.environ.get('LMS_BASE_URL', '')
Expand Down

0 comments on commit bcaa529

Please sign in to comment.