version 0.11

IHEC · Oct 3, 2022 · c9f4a59 · c9f4a59
1 parent d89aa91
commit c9f4a59
Show file tree

Hide file tree

Showing 15 changed files with 18,669 additions and 0 deletions.
diff --git a/openrefine/v0.11/.Rprofile b/openrefine/v0.11/.Rprofile
@@ -0,0 +1 @@
+source("renv/activate.R")
diff --git a/openrefine/v0.11/IHEC_metadata_harmonization.v0.11.csv b/openrefine/v0.11/IHEC_metadata_harmonization.v0.11.csv
diff --git a/openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.csv b/openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.csv
diff --git a/openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.intermediate.csv b/openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.intermediate.csv
diff --git a/openrefine/v0.11/add_higher_order_v0.11.R b/openrefine/v0.11/add_higher_order_v0.11.R
@@ -0,0 +1,135 @@
+# Setup for adding higher-order ontology annotations to the IHEC metadata table.
+# renv::restore() is expected to bring the project library in line with the
+# renv lockfile before any package is loaded.
+renv::restore()
+
+library(rols)
+library(data.table)
+# presumably provides myAncestors(), which is called in the main loop below
+source('myAncestors.R')
+
+# load the intermediate IHEC metadata table
+original_dt <- fread('IHEC_metadata_harmonization.v0.11.extended.intermediate.csv')
+# delimiter between several CURIEs stored in one cell; also used to join term names
+separator <- '::'
+
+### Ontologies
+# rols ontology objects used for the term lookups: CL, EFO, UBERON and NCIT
+cl <- Ontology("cl")
+efo <- Ontology("efo")
+uberon <- Ontology("uberon")
+ncit <- Ontology("ncit")
+# Adds one column per entry of `ls` to `main_dt`, named `order_<l>`, holding for
+# every selected row one term taken from that row's `ancestors`, chosen so that
+# the column contains at most `l` distinct values.
+#
+# Arguments:
+#   main_dt           data.table with a list column `ancestors` (one vector of
+#                     term labels per row). It is modified by reference.
+#   rows_of_interest  logical row selector; only these rows are counted and
+#                     annotated (default: all rows).
+#   ls                maximum numbers of distinct terms, one new column each.
+#
+# Returns `main_dt`. Side effects: the `ancestors` of the selected rows are
+# reordered by ascending occurrence count, and a warning is raised for every
+# limit in `ls` for which no column could be created.
+add_higher_order <- function(main_dt, rows_of_interest=rep(TRUE, times=main_dt[, .N]), ls = c(15, 30)){
+
+  # process the loosest limit first so that the search for the next, stricter
+  # limit can continue from the threshold that was already reached
+  ls <- sort(ls, decreasing = TRUE)
+  n_rows_of_interest <- sum(rows_of_interest, na.rm = TRUE)
+
+  # named vector: term -> number of selected rows' ancestor entries containing it
+  ancestor_occurences <- as.data.table(main_dt[rows_of_interest, sort(table(unlist(ancestors)))])
+  setnames(ancestor_occurences, c('V1', 'N'), c('term', 'counts'))
+  unique_ancestor2occurences <- ancestor_occurences[, counts]
+  names(unique_ancestor2occurences) <- ancestor_occurences[, term]
+
+  # order every ancestor vector from the rarest to the most frequent term
+  main_dt[rows_of_interest, ancestors:=lapply(ancestors, function(a) a[order(unique_ancestor2occurences[a])])]
+
+  last_k <- 1L
+  for (l in ls) {
+    column_added <- FALSE
+    # raise the occurrence threshold k until at most l distinct terms remain
+    for (k in seq.int(last_k, n_rows_of_interest)) {
+      new_col <-
+        main_dt[rows_of_interest, sapply(ancestors, function(ancestor_vector){
+          if(is.null(ancestor_vector)) return(NA)
+          # positions of the ancestors occurring at most k times; pick the last
+          # (most frequent) one, or fall back to the first ancestor if none
+          # qualifies. Checking the length avoids depending on the text of the
+          # warning that max() emits for empty input.
+          eligible <- which(unique_ancestor2occurences[ancestor_vector] <= k)
+          ancestor_vector[if (length(eligible) > 0L) max(eligible) else 1L]
+        }
+        )]
+      if (l >= uniqueN(new_col)) {
+        if (n_rows_of_interest != length(new_col))
+          stop('new column has different length than number of rows')
+        main_dt[rows_of_interest, (paste('order', l, sep='_')):=new_col]
+        last_k <- k
+        column_added <- TRUE
+        break
+      }
+    }
+    if (!column_added)
+      warning('no threshold yields at most ', l, ' distinct terms; column order_', l, ' was not added', call. = FALSE)
+  }
+
+  main_dt
+}
+
+# now go through the curie cols ----
+# For 'sample_ontology_curie' and every column ending in '_ncit':
+#   1. split multi-curie cells into single curies and look up their terms,
+#   2. collect each term's label plus the labels of its same-ontology ancestors,
+#   3. aggregate per original cell value (ancestors shared by all its curies),
+#   4. derive the order_15 / order_30 columns once on the aggregated table
+#      (stored with the suffix '_unique') and once on the full metadata table,
+#   5. print manual annotations that are not among the computed ancestors.
+metadata_dt <- copy(original_dt)
+
+ncit_cols <- names(original_dt)[endsWith(names(original_dt), '_ncit')]
+for (curie_col in c('sample_ontology_curie', ncit_cols)){
+  message(curie_col)
+  # one row per single curie, keyed by the original (possibly multi-curie) cell value
+  spread_curies <- original_dt[, .(single_curie=unlist(tstrsplit(get(curie_col), separator, fixed=TRUE))), by=mget(curie_col)]
+  if (curie_col == 'sample_ontology_curie') {
+    # the curie prefix decides which ontology object is queried
+    spread_curies[, sample_ontology:=tstrsplit(single_curie, ':', fixed=TRUE, keep = 1)]
+    spread_curies[sample_ontology=='CL', term:=sapply(single_curie, term, object=cl)]
+    spread_curies[sample_ontology=='EFO', term:=sapply(single_curie, term, object=efo)]
+    spread_curies[sample_ontology=='UBERON', term:=sapply(single_curie, term, object=uberon)]
+    spread_curies[, sample_ontology:=as.factor(sample_ontology)]
+  } else {
+    spread_curies[, term:=sapply(single_curie, term, object=ncit)]
+  }
+  spread_curies[, term_name:=sapply(term, function(t) unname(termLabel(t)))]
+  # own label first, followed by the ancestor labels whose id starts with the term's ontology prefix
+  spread_curies[, ancestors:=lapply(term, function(t) {
+    ancestors <- termLabel(myAncestors(t))
+    ancestors <- ancestors[startsWith(names(ancestors), toupper(termOntology(t)))]
+    c(termLabel(t), ancestors)
+  })]
+  if (curie_col == 'sample_ontology_curie') {
+    # per cell value: joined term names and the ancestors common to all of its curies
+    aggregated_ancestors <- spread_curies[, .(term_name=paste(sort(term_name), collapse = separator), ancestors=list(unique(unlist(Reduce(intersect, ancestors))))), 
+                                          by=.(sample_ontology, sample_ontology_curie)]
+    metadata_dt[aggregated_ancestors, on=.(sample_ontology_curie), (c('sample_ontology', 'sample_ontology_term')):=mget(c('sample_ontology', 'term_name'))]
+    metadata_dt[, sample_ontology:=as.factor(sample_ontology)]
+    # sample ontologies are handled separately, one call per ontology
+    for (this_ontology in aggregated_ancestors[, levels(sample_ontology)]) {
+      add_higher_order(aggregated_ancestors, rows_of_interest = aggregated_ancestors[, sample_ontology == this_ontology])
+    }
+    print(aggregated_ancestors[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30)), by=sample_ontology])
+  } else {
+    aggregated_ancestors <- spread_curies[, .(term_name=paste(sort(term_name), collapse = separator), ancestors=list(unique(unlist(Reduce(intersect, ancestors))))), 
+                                          by=mget(curie_col)]
+    add_higher_order(aggregated_ancestors)
+    print(aggregated_ancestors[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30))])
+  }
+  # copy the per-unique-value order columns (suffix '_unique') and the ancestors into the full table
+  cols_to_add <- names(aggregated_ancestors)[startsWith(names(aggregated_ancestors), 'order')]
+  cure_col_term <- sub('curie', 'term', curie_col, fixed = TRUE)
+  metadata_dt[aggregated_ancestors, on=curie_col, (c(paste(cure_col_term, cols_to_add, 'unique', sep = '_'), 'ancestors')):=mget(c(cols_to_add, 'ancestors'))]
+  if (curie_col == 'sample_ontology_curie') {
+    # report rows whose manual high-order term is not among the computed ancestors
+    non_matching_manual <- metadata_dt[!mapply(function(term, a) term %in% a, sample_ontology_term_high_order_manual, ancestors)]
+    if(nrow(non_matching_manual) > 0)
+      print(non_matching_manual[, .(EpiRR, project, biomaterial_type, sample_ontology_curie, cell_type, tissue_type, line, sample_ontology_term_high_order_manual)])
+    for (this_ontology in metadata_dt[, levels(sample_ontology)]) {
+      add_higher_order(metadata_dt, rows_of_interest = metadata_dt[, sample_ontology == this_ontology])
+    }
+    print(metadata_dt[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30)), by=sample_ontology])
+  } else {
+    if (curie_col == 'disease_ontology_curie_ncit') {
+      non_matching_manual <- metadata_dt[!mapply(function(term, a) term %in% a, disease_intermediate_order_manual, ancestors)]
+      if(nrow(non_matching_manual)>0){
+        # report instead of dropping into the debugger: this script is run
+        # through Rscript, where browser() would wait for input on stdin
+        message('disease_intermediate_order_manual entries not found among the computed ancestors:')
+        print(non_matching_manual[, .(EpiRR, project, biomaterial_type, disease, disease_ontology_curie, disease_ontology_curie_ncit, disease_intermediate_order_manual, disease_high_order_manual)])
+      }
+    }
+    add_higher_order(metadata_dt)
+    print(metadata_dt[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30))])
+  }
+  # ancestors is only a helper column; prefix the new order columns with the term column name
+  metadata_dt[, ancestors:=NULL]
+  cols_to_rename <- names(metadata_dt)[startsWith(names(metadata_dt), 'order')]
+  setnames(metadata_dt, cols_to_rename, paste(cure_col_term, cols_to_rename, sep = '_'))
+}
+
+# order_15 becomes 'high_order' and order_30 'intermediate_order'; an 'ncit_' directly in front is dropped
+names(metadata_dt) <- gsub('(ncit_)?order_15', 'high_order', names(metadata_dt))
+names(metadata_dt) <- gsub('(ncit_)?order_30', 'intermediate_order', names(metadata_dt))
+
+fwrite(metadata_dt, file = 'IHEC_metadata_harmonization.v0.11.extended.csv')
diff --git a/openrefine/v0.11/create_v0.11.py b/openrefine/v0.11/create_v0.11.py
@@ -0,0 +1,109 @@
+import os.path
+# urllib.request must be imported explicitly: a bare `import urllib` does not load the submodule
+import urllib.request
+from subprocess import run
+
+import pandas as pd
+
+# make sure the working directory when running this file is the project root of the git project
+os.chdir('../../')
+
+# create openrefine project and apply rules - OPENREFINE SERVER HAS TO BE RUNNING
+# creating openrefine projects via the openrefine-client needs a csv as input in order to work properly
+openrefine_client = './openrefine/openrefine-client_0-3-10_linux'  # path to the openrefine executable
+initial_csv = './openrefine/v0.10/IHEC_metadata_harmonization.v0.10.extended.csv'  # csv to build project from
+
+# create project with intermediate version; the project is named after the csv file without its extension
+intermediate_project_name = os.path.splitext(os.path.basename(initial_csv))[0]
+# run([openrefine_client, '--delete', intermediate_project_name], check=True)
+run([openrefine_client, '--create', initial_csv], check=True)
+
+# here we manually solve some mapping issues and conflicts and the resulting json is then used in this script
+
+run([openrefine_client, '--apply', 'openrefine/v0.11/fixing_inconsistencies.json', intermediate_project_name],
+    check=True)
+run([openrefine_client, '--apply', 'openrefine/v0.11/fix_other_and_blank.json', intermediate_project_name],
+    check=True)
+
+# export the project with both rule sets applied
+v0_11_extended_intermediate_csv = './openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.intermediate.csv'
+run([openrefine_client, '--export', f'--output={v0_11_extended_intermediate_csv}', intermediate_project_name],
+    check=True)
+
+# add the EpiRR 'type' field as column 'donor_type', matched on the EpiRR accession
+v0_11_extended_intermediate = pd.read_csv(v0_11_extended_intermediate_csv)
+epirr_all = urllib.request.urlopen('https://www.ebi.ac.uk/vg/epirr/view/all?format=json').read()
+epirr_all_dt = pd.read_json(epirr_all)
+epirr_all_dt.rename(columns={'type': 'donor_type'}, inplace=True)
+v0_11_extended_intermediate_merged = pd.merge(
+    v0_11_extended_intermediate,
+    epirr_all_dt[['full_accession', 'donor_type']],
+    how="inner",
+    left_on='EpiRR',
+    right_on='full_accession',
+    validate='one_to_one'
+)
+v0_11_extended_intermediate_merged.drop(columns='full_accession', inplace=True)
+
+# the inner join must not have dropped any row
+assert (len(v0_11_extended_intermediate) == len(v0_11_extended_intermediate_merged))
+# drop the columns ending in 'order' or 'unique' before the R script runs
+automatic_higher_level = v0_11_extended_intermediate_merged.columns.str.endswith(
+    'order') | v0_11_extended_intermediate_merged.columns.str.endswith('unique')
+v0_11_extended_intermediate_merged.loc[:, ~automatic_higher_level].to_csv(v0_11_extended_intermediate_csv, index=False)
+
+run(['Rscript', 'add_higher_order_v0.11.R'], check=True, cwd='./openrefine/v0.11')
+
+# extended csv that is read here after the Rscript step and rewritten below with renamed, reordered columns
+v0_11_extended_csv = './openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.csv'
+v0_11_extended = pd.read_csv(v0_11_extended_csv)
+v0_11_extended.sort_values(by='EpiRR', inplace=True)
+
+# old column name -> name used in the released tables
+renaming_dict = {'EpiRR': 'EpiRR',
+                 'EpiRR_status': 'EpiRR_status',
+                 'project': 'project',
+                 'biomaterial_type': 'harm_biomaterial_type',
+                 'line': 'harm_line',
+                 'markers': 'harm_markers',
+                 'cell_type': 'harm_cell_type',
+                 'tissue_type': 'harm_tissue_type',
+                 'sample_ontology_curie': 'harm_sample_ontology_curie',
+                 'disease': 'harm_disease',
+                 'disease_ontology_curie': 'harm_disease_ontology_curie',
+                 'donor_age': 'harm_donor_age',
+                 'donor_age_unit': 'harm_donor_age_unit',
+                 'donor_health_status': 'harm_donor_health_status',
+                 'donor_health_status_ontology_curie': 'harm_donor_health_status_ontology_curie',
+                 'donor_id': 'harm_donor_id',
+                 'donor_life_stage': 'harm_donor_life_stage',
+                 'health_state': 'harm_donor_life_status',
+                 'sex': 'harm_donor_sex',
+                 'sample_ontology_term_high_order_manual': 'harm_sample_ontology_intermediate',
+                 'disease_high_order_manual': 'harm_disease_high',
+                 'disease_intermediate_order_manual': 'harm_disease_intermediate',
+                 'donor_type': 'donor_type'}
+
+v0_11_extended.rename(columns=renaming_dict, inplace=True)
+# fixed column selection and order of the extended table
+v0_11_extended = v0_11_extended[['EpiRR', 'project', 'harm_biomaterial_type', 'harm_sample_ontology_intermediate', 'harm_disease_high', 'harm_disease_intermediate',
+     'EpiRR_status', 'harm_cell_type', 'harm_line', 'harm_tissue_type', 'harm_sample_ontology_curie', 'harm_markers',
+     'sample_ontology', 'sample_ontology_term', 'sample_ontology_term_high_order_JeffreyHyacinthe', 'sample_ontology_term_high_order_JonathanSteif', 'sample_ontology_term_intermediate_order_unique', 'sample_ontology_term_high_order_unique', 'sample_ontology_term_intermediate_order', 'sample_ontology_term_high_order',
+     'harm_disease', 'harm_disease_ontology_curie', 'disease_ontology_curie_ncit', 'disease_ontology_term_intermediate_order_unique', 'disease_ontology_term_high_order_unique', 'disease_ontology_term_intermediate_order', 'disease_ontology_term_high_order',
+     'donor_type', 'harm_donor_id', 'harm_donor_age', 'harm_donor_age_unit', 'harm_donor_life_stage', 'harm_donor_sex',
+     'harm_donor_health_status', 'harm_donor_health_status_ontology_curie', 'donor_health_status_ontology_curie_ncit', 'donor_health_status_ontology_term_intermediate_order_unique', 'donor_health_status_ontology_term_high_order_unique', 'donor_health_status_ontology_term_intermediate_order', 'donor_health_status_ontology_term_high_order',
+     'harm_donor_life_status']]
+
+v0_11_extended.to_csv(v0_11_extended_csv, index=False)
+
+final_csv = './openrefine/v0.11/IHEC_metadata_harmonization.v0.11.csv'
+
+# the final table keeps only the columns that appear as values in renaming_dict
+v0_11_extended.loc[:, v0_11_extended.columns.isin(renaming_dict.values())].to_csv(final_csv, index=False)
+
+# diff against the previous version: index both tables by EpiRR and sort rows and columns
+# (axis is passed by keyword; positional use is deprecated in pandas 1.x and rejected in 2.x)
+old = pd.read_csv(initial_csv)
+old.index = old.EpiRR
+old.sort_index(axis=0, inplace=True)
+old.sort_index(axis=1, inplace=True)
+new = pd.read_csv(v0_11_extended_csv)
+new.index = new.EpiRR
+# map the new column names back to the old ones so both tables carry the same labels
+new.rename(columns={v: k for k, v in renaming_dict.items()}, inplace=True)
+new.drop(columns=['donor_type'], inplace=True)
+new.sort_index(axis=0, inplace=True)
+new.sort_index(axis=1, inplace=True)
+
+diff_tbl = old.compare(new)
+diff_tbl.rename(columns={'self': 'v0.10', 'other': 'v0.11'}, inplace=True)
+diff_tbl.rename(columns={k: k + ':' + v for k, v in renaming_dict.items()}, inplace=True)
+# one json entry per differing row, containing only the cells that changed
+diff_tbl.apply(lambda x: [x.dropna()], axis=1).to_json('openrefine/v0.11/diff_v0.10_v0.11.json', indent=True)