Skip to content

Commit

Permalink
Merge pull request #500 from openedx/iahmad/ENT-9436
Browse files Browse the repository at this point in the history
feat: Transformed pandas processing into SQL for leaderboard
  • Loading branch information
saleem-latif authored Sep 25, 2024
2 parents 5e93c1a + 0666de6 commit a3eca81
Show file tree
Hide file tree
Showing 20 changed files with 277 additions and 911 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ Unreleased

=========================

[9.2.0] - 2024-09-25
---------------------
* refactor: Performance optimizations for leaderboard API endpoints

[9.1.1] - 2024-09-24
---------------------
* fix: disable caching for EnterpriseLearnerEnrollmentViewSet
Expand Down
2 changes: 1 addition & 1 deletion enterprise_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
Enterprise data api application. This Django app exposes API endpoints used by enterprises.
"""

__version__ = "9.1.1"
__version__ = "9.2.0"
20 changes: 3 additions & 17 deletions enterprise_data/admin_analytics/constants.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,10 @@
"""Advanced Analytics Constants"""
"""
Constants for admin analytics app.
"""

from enum import Enum


class Granularity(Enum):
"""Granularity choices"""
DAILY = 'Daily'
WEEKLY = 'Weekly'
MONTHLY = 'Monthly'
QUARTERLY = 'Quarterly'


class Calculation(Enum):
"""Calculation choices"""
TOTAL = 'Total'
RUNNING_TOTAL = 'Running Total'
MOVING_AVERAGE_3_PERIOD = 'Moving Average (3 Period)'
MOVING_AVERAGE_7_PERIOD = 'Moving Average (7 Period)'


class ResponseType(Enum):
"""Response type choices"""
JSON = 'json'
Expand Down
125 changes: 0 additions & 125 deletions enterprise_data/admin_analytics/data_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,138 +3,13 @@
"""
from logging import getLogger

import numpy
import pandas

from django.http import Http404

from enterprise_data.admin_analytics.database import run_query
from enterprise_data.utils import timer

LOGGER = getLogger(__name__)


def get_select_query(table: str, columns: list, enterprise_uuid: str) -> str:
"""
Generate a SELECT query for the given table and columns.
Arguments:
table (str): The table to query.
columns (list): The columns to select.
enterprise_uuid (str): The UUID of the enterprise customer.
Returns:
(str): The SELECT query.
"""
return f'SELECT {", ".join(columns)} FROM {table} WHERE enterprise_customer_uuid = "{enterprise_uuid}"'


def fetch_enrollment_data(enterprise_uuid: str):
"""
Fetch enrollment data from the database for the given enterprise customer.
Arguments:
enterprise_uuid (str): The UUID of the enterprise customer.
Returns:
(pandas.DataFrame): The enrollment data.
"""
enterprise_uuid = enterprise_uuid.replace('-', '')

columns = [
'enterprise_customer_name',
'enterprise_customer_uuid',
'lms_enrollment_id',
'user_id',
'email',
'course_key',
'courserun_key',
'course_id',
'course_subject',
'course_title',
'enterprise_enrollment_date',
'lms_enrollment_mode',
'enroll_type',
'program_title',
'date_certificate_awarded',
'grade_percent',
'cert_awarded',
'date_certificate_created_raw',
'passed_date_raw',
'passed_date',
'has_passed',
]
query = get_select_query(
table='fact_enrollment_admin_dash',
columns=columns,
enterprise_uuid=enterprise_uuid,
)

with timer('fetch_enrollment_data'):
results = run_query(query=query)

if not results:
raise Http404(f'No enrollment data found for enterprise {enterprise_uuid}')

LOGGER.info(f'[PLOTLY] Enrollment data fetched successfully. Records: {len(results)}')
enrollments = pandas.DataFrame(numpy.array(results), columns=columns)
LOGGER.info('[PLOTLY] Enrollment data converted to DataFrame.')

# Convert date columns to datetime.
enrollments['enterprise_enrollment_date'] = enrollments['enterprise_enrollment_date'].astype('datetime64[ns]')
enrollments['date_certificate_awarded'] = enrollments['date_certificate_awarded'].astype('datetime64[ns]')
enrollments['date_certificate_created_raw'] = enrollments['date_certificate_created_raw'].astype('datetime64[ns]')
enrollments['passed_date_raw'] = enrollments['passed_date_raw'].astype('datetime64[ns]')
enrollments['passed_date'] = enrollments['passed_date'].astype('datetime64[ns]')

return enrollments


def fetch_engagement_data(enterprise_uuid: str):
"""
Fetch engagement data from the database for the given enterprise customer.
Arguments:
enterprise_uuid (str): The UUID of the enterprise customer.
Returns:
(pandas.DataFrame): The engagement data.
"""
enterprise_uuid = enterprise_uuid.replace('-', '')

columns = [
'user_id',
'email',
'enterprise_customer_uuid',
'course_key',
'enroll_type',
'activity_date',
'course_title',
'course_subject',
'is_engaged',
'is_engaged_video',
'is_engaged_forum',
'is_engaged_problem',
'is_active',
'learning_time_seconds',
]
query = get_select_query(
table='fact_enrollment_engagement_day_admin_dash', columns=columns, enterprise_uuid=enterprise_uuid
)

with timer('fetch_engagement_data'):
results = run_query(query=query)
if not results:
raise Http404(f'No engagement data found for enterprise {enterprise_uuid}')

LOGGER.info(f'[PLOTLY] Engagement data fetched successfully. Records: {len(results)}')
engagement = pandas.DataFrame(numpy.array(results), columns=columns)
LOGGER.info('[PLOTLY] Engagement data converted to DataFrame.')
engagement['activity_date'] = engagement['activity_date'].astype('datetime64[ns]')

return engagement


def fetch_max_enrollment_datetime():
"""
Fetch the latest created date from the enterprise_learner_enrollment table.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,88 @@ def get_engagement_time_series_data_query():
GROUP BY activity_date, enroll_type
ORDER BY activity_date;
"""

@staticmethod
def get_all_leaderboard_data_query():
"""
Get the query to fetch the leaderboard data.
Query should fetch the leaderboard data for the enterprise customer to show in the data table.
Returns:
(str): Query to fetch the leaderboard data.
"""
return """
WITH Engagement AS (
SELECT
email,
ROUND(SUM(learning_time_seconds) / 3600, 1) as learning_time_hours,
SUM(is_engaged) as session_count,
CASE
WHEN SUM(is_engaged) = 0 THEN 0.0
ELSE ROUND(SUM(learning_time_seconds) / 3600 / SUM(is_engaged), 1)
END AS average_session_length
FROM fact_enrollment_engagement_day_admin_dash
WHERE enterprise_customer_uuid=%(enterprise_customer_uuid)s AND
(activity_date BETWEEN %(start_date)s AND %(end_date)s) AND
is_engaged = 1
GROUP BY email
),
Completions AS (
SELECT email, count(course_key) as course_completion_count
FROM fact_enrollment_admin_dash
WHERE enterprise_customer_uuid=%(enterprise_customer_uuid)s AND
(passed_date BETWEEN %(start_date)s AND %(end_date)s) AND
has_passed = 1
GROUP BY email
)
SELECT
Engagement.email,
Engagement.learning_time_hours,
Engagement.session_count,
Engagement.average_session_length,
Completions.course_completion_count
FROM Engagement
LEFT JOIN Completions
ON Engagement.email = Completions.email
ORDER BY
Engagement.learning_time_hours DESC,
Engagement.session_count DESC,
Completions.course_completion_count DESC
LIMIT %(limit)s OFFSET %(offset)s;
"""

@staticmethod
def get_leaderboard_data_count_query():
"""
Get the query to fetch the leaderboard row count.
Query should fetch the count of rows for the leaderboard data for the enterprise customer.
Returns:
(str): Query to fetch the leaderboard row count.
"""
return """
WITH Engagement AS (
SELECT
email
FROM fact_enrollment_engagement_day_admin_dash
WHERE enterprise_customer_uuid=%(enterprise_customer_uuid)s AND
(activity_date BETWEEN %(start_date)s AND %(end_date)s) AND
is_engaged = 1
GROUP BY email
),
Completions AS (
SELECT email, count(course_key) as course_completion_count
FROM fact_enrollment_admin_dash
WHERE enterprise_customer_uuid=%(enterprise_customer_uuid)s AND
(passed_date BETWEEN %(start_date)s AND %(end_date)s) AND
has_passed = 1
GROUP BY email
)
SELECT
count(*)
FROM Engagement
LEFT JOIN Completions
ON Engagement.email = Completions.email
"""
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,53 @@ def get_engagement_time_series_data(self, enterprise_customer_uuid: UUID, start_
},
as_dict=True,
)

def get_all_leaderboard_data(
self, enterprise_customer_uuid: UUID, start_date: date, end_date: date, limit: int, offset: int
):
"""
Get the leaderboard data for the given enterprise customer.
Arguments:
enterprise_customer_uuid (UUID): The UUID of the enterprise customer.
start_date (date): The start date.
end_date (date): The end date.
limit (int): The maximum number of records to return.
offset (int): The number of records to skip.
Returns:
list[dict]: The leaderboard data.
"""
return run_query(
query=self.queries.get_all_leaderboard_data_query(),
params={
'enterprise_customer_uuid': enterprise_customer_uuid,
'start_date': start_date,
'end_date': end_date,
'limit': limit,
'offset': offset,
},
as_dict=True,
)

def get_leaderboard_data_count(self, enterprise_customer_uuid: UUID, start_date: date, end_date: date):
"""
Get the total number of leaderboard records for the given enterprise customer.
Arguments:
enterprise_customer_uuid (UUID): The UUID of the enterprise customer.
start_date (date): The start date.
end_date (date): The end date.
Returns:
(int): The total number of leaderboard records.
"""
results = run_query(
query=self.queries.get_leaderboard_data_count_query(),
params={
'enterprise_customer_uuid': enterprise_customer_uuid,
'start_date': start_date,
'end_date': end_date,
}
)
return results[0][0]
Loading

0 comments on commit a3eca81

Please sign in to comment.