Merge pull request #71 from Sage-Bionetworks/etl-521
[ETL-521] Run JSON to Parquet jobs in groups
philerooski authored Aug 21, 2023
2 parents ad5d307 + b20964f commit fdf30bd
Showing 2 changed files with 104 additions and 6 deletions.
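The change below splits the JSON to Parquet jobs into groups that run one after another: non-device tables first, then the HealthKit, Fitbit, Google, and Garmin families in sequence, each gated on the previous group succeeding. The template does this grouping with Jinja2 substring filters over the dataset names; a minimal Python sketch of the same partitioning logic (the dataset names here are hypothetical examples, not the real schema list):

```python
# Sketch of the grouping the workflow template performs with Jinja2 loop
# filters: a data type belongs to the first device family whose name it
# contains, and everything else runs in the initial (non-device) group.
DEVICE_GROUPS = ["HealthKit", "Fitbit", "Google", "Garmin"]

def group_datasets(data_types):
    """Partition data types into the ordered trigger groups used by the workflow."""
    groups = {"General": [], **{g: [] for g in DEVICE_GROUPS}}
    for data_type in data_types:
        for device in DEVICE_GROUPS:
            if device in data_type:
                groups[device].append(data_type)
                break
        else:
            # No device substring matched: this table starts with the
            # S3ToJsonCompleteTrigger group.
            groups["General"].append(data_type)
    return groups

# Hypothetical dataset names for illustration only.
example = ["EnrolledParticipants", "SymptomLog", "HealthKitV2Samples",
           "FitbitSleepLogs", "GoogleFitSamples", "GarminDailySummary"]
```

Each group's trigger then lists the previous group's jobs as its `SUCCEEDED` conditions, which is what serializes the groups.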
2 changes: 1 addition & 1 deletion templates/glue-job-JSONToParquet.j2
@@ -40,7 +40,7 @@ Parameters:
TimeoutInMinutes:
Type: Number
Description: The job timeout in minutes (integer).
Default: 120
Default: 720

TempS3Bucket:
Type: String
108 changes: 103 additions & 5 deletions templates/glue-workflow.j2
@@ -1,6 +1,15 @@
AWSTemplateFormatVersion: '2010-09-09'

Description: The primary workflow for processing RECOVER data
Description: >-
The primary workflow for processing RECOVER data. An outline of the workflow is below:

S3 to JSON ->
(JSON to Parquet) EnrolledParticipants and SymptomLog ->
(JSON to Parquet) HealthKit ->
(JSON to Parquet) Fitbit ->
(JSON to Parquet) Google ->
(JSON to Parquet) Garmin ->
CompareParquetJob

Parameters:

@@ -53,7 +62,8 @@ Resources:
{% set datasets = [] %}
{% for v in sceptre_user_data.dataset_schemas.tables.keys() if not "Deleted" in v %}
{% set dataset = {} %}
{% do dataset.update({'table_name': 'dataset_' + v.lower()})%}
{% do dataset.update({'data_type': v}) %}
{% do dataset.update({'table_name': 'dataset_' + v.lower()}) %}
{% do dataset.update({'stackname_prefix': '{}'.format(v.replace('_',''))}) %}
{% do datasets.append(dataset) %}
{% endfor %}
@@ -88,11 +98,11 @@ Resources:
Properties:
Name: !Sub "${Namespace}-S3ToJsonCompleteTrigger"
Actions:
{% for dataset in datasets %}
{% for dataset in datasets if not "HealthKit" in dataset["data_type"] and not "Fitbit" in dataset["data_type"] and not "Google" in dataset["data_type"] and not "Garmin" in dataset["data_type"] %}
- JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
Arguments: {"--glue-table": {{ "{}".format(dataset["table_name"]) }} }
{% endfor %}
Description: This trigger runs after completion of the S3 to JSON job.
Description: This trigger runs after completion of the S3 to JSON job and starts every JSON to Parquet job that is not associated with a device.
Type: CONDITIONAL
Predicate:
Conditions:
@@ -102,6 +112,94 @@ Resources:
StartOnCreation: true
WorkflowName: !Ref PrimaryWorkflow

HealthKitTrigger:
Type: AWS::Glue::Trigger
Properties:
Name: !Sub "${Namespace}-HealthKitTrigger"
Actions:
{% for dataset in datasets if "HealthKit" in dataset["data_type"] %}
- JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
Arguments: {"--glue-table": {{ "{}".format(dataset["table_name"]) }} }
{% endfor %}
Description: This trigger kicks off every JSON to Parquet job which is associated with a HealthKit data type.
Type: CONDITIONAL
Predicate:
Conditions:
{% for dataset in datasets if not "HealthKit" in dataset["data_type"] and not "Fitbit" in dataset["data_type"] and not "Google" in dataset["data_type"] and not "Garmin" in dataset["data_type"] %}
- JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
State: SUCCEEDED
LogicalOperator: EQUALS
{% endfor %}
Logical: AND
StartOnCreation: true
WorkflowName: !Ref PrimaryWorkflow

FitbitTrigger:
Type: AWS::Glue::Trigger
Properties:
Name: !Sub "${Namespace}-FitbitTrigger"
Actions:
{% for dataset in datasets if "Fitbit" in dataset["data_type"] %}
- JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
Arguments: {"--glue-table": {{ "{}".format(dataset["table_name"]) }} }
{% endfor %}
Description: This trigger kicks off every JSON to Parquet job which is associated with a Fitbit data type.
Type: CONDITIONAL
Predicate:
Conditions:
{% for dataset in datasets if "HealthKit" in dataset["data_type"] %}
- JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
State: SUCCEEDED
LogicalOperator: EQUALS
{% endfor %}
Logical: AND
StartOnCreation: true
WorkflowName: !Ref PrimaryWorkflow

GoogleTrigger:
Type: AWS::Glue::Trigger
Properties:
Name: !Sub "${Namespace}-GoogleTrigger"
Actions:
{% for dataset in datasets if "Google" in dataset["data_type"] %}
- JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
Arguments: {"--glue-table": {{ "{}".format(dataset["table_name"]) }} }
{% endfor %}
Description: This trigger kicks off every JSON to Parquet job which is associated with a Google data type.
Type: CONDITIONAL
Predicate:
Conditions:
{% for dataset in datasets if "Fitbit" in dataset["data_type"] %}
- JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
State: SUCCEEDED
LogicalOperator: EQUALS
{% endfor %}
Logical: AND
StartOnCreation: true
WorkflowName: !Ref PrimaryWorkflow

GarminTrigger:
Type: AWS::Glue::Trigger
Properties:
Name: !Sub "${Namespace}-GarminTrigger"
Actions:
{% for dataset in datasets if "Garmin" in dataset["data_type"] %}
- JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
Arguments: {"--glue-table": {{ "{}".format(dataset["table_name"]) }} }
{% endfor %}
Description: This trigger kicks off every JSON to Parquet job which is associated with a Garmin data type.
Type: CONDITIONAL
Predicate:
Conditions:
{% for dataset in datasets if "Google" in dataset["data_type"] %}
- JobName: !Sub ${Namespace}-{{ dataset["stackname_prefix"]}}-Job
State: SUCCEEDED
LogicalOperator: EQUALS
{% endfor %}
Logical: AND
StartOnCreation: true
WorkflowName: !Ref PrimaryWorkflow

JsontoParquetCompleteTrigger:
Type: AWS::Glue::Trigger
Condition: IsStagingNamespace
@@ -121,7 +219,7 @@ Resources:
Type: CONDITIONAL
Predicate:
Conditions:
{% for dataset in datasets %}
{% for dataset in datasets if "Garmin" in dataset["data_type"] %}
- JobName: !Sub "${Namespace}-{{ dataset["stackname_prefix"] }}-Job"
State: SUCCEEDED
LogicalOperator: EQUALS
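Once rendered, each of the device triggers above has the same shape: a CONDITIONAL trigger whose actions are that group's jobs and whose predicate requires every job in the previous group to have succeeded. A sketch of that shape as the request a boto3 `glue.create_trigger` call would take (the namespace, job names, and table names below are hypothetical placeholders, and the workflow name is illustrative):

```python
# Illustrative shape of one rendered device trigger, modeled on the
# HealthKitTrigger resource above. This only builds the request dict;
# it does not call AWS.
def make_device_trigger(namespace, device, device_jobs, upstream_jobs):
    """Build a CONDITIONAL trigger that starts `device_jobs` once every
    job in `upstream_jobs` has succeeded."""
    return {
        "Name": f"{namespace}-{device}Trigger",
        "WorkflowName": f"{namespace}-PrimaryWorkflow",
        "Type": "CONDITIONAL",
        "StartOnCreation": True,
        "Actions": [
            {"JobName": job, "Arguments": {"--glue-table": table}}
            for job, table in device_jobs
        ],
        "Predicate": {
            "Logical": "AND",
            "Conditions": [
                {"JobName": job, "State": "SUCCEEDED",
                 "LogicalOperator": "EQUALS"}
                for job in upstream_jobs
            ],
        },
    }

trigger = make_device_trigger(
    "main", "HealthKit",
    [("main-HealthKitV2Samples-Job", "dataset_healthkitv2samples")],
    ["main-EnrolledParticipants-Job", "main-SymptomLog-Job"],
)
```

In the template this wiring is produced by CloudFormation from the `AWS::Glue::Trigger` resources rather than API calls, but the same dict could be passed as keyword arguments to a boto3 Glue client's `create_trigger`.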
