Skip to content

Commit

Permalink
version 0.11
Browse files Browse the repository at this point in the history
  • Loading branch information
quirinmanz committed Oct 3, 2022
1 parent d89aa91 commit c9f4a59
Show file tree
Hide file tree
Showing 15 changed files with 18,669 additions and 0 deletions.
1 change: 1 addition & 0 deletions openrefine/v0.11/.Rprofile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
source("renv/activate.R")
2,659 changes: 2,659 additions & 0 deletions openrefine/v0.11/IHEC_metadata_harmonization.v0.11.csv

Large diffs are not rendered by default.

2,659 changes: 2,659 additions & 0 deletions openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.csv

Large diffs are not rendered by default.

2,659 changes: 2,659 additions & 0 deletions openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.intermediate.csv

Large diffs are not rendered by default.

135 changes: 135 additions & 0 deletions openrefine/v0.11/add_higher_order_v0.11.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Setup for the higher-order ontology annotation of the IHEC metadata table.
# Restore the renv project library before any package is attached.
renv::restore()

library(rols)
library(data.table)
# presumably defines myAncestors(), which is called further below - verify
source('myAncestors.R')

# Load the intermediate IHEC metadata table (path relative to the working directory).
original_dt <- fread('IHEC_metadata_harmonization.v0.11.extended.intermediate.csv')
# Delimiter used below to split cells holding several CURIEs and to re-join term names.
separator <- '::'

### Ontologies
# One ontology handle per ontology that is queried for terms below.
cl <- Ontology("cl")
efo <- Ontology("efo")
uberon <- Ontology("uberon")
ncit <- Ontology("ncit")

#' Add coarse-grained ("higher order") annotation columns to a data.table
#'
#' For every limit `l` in `ls`, a column `order_<l>` is added to `main_dt` by
#' reference. Each selected row receives one of its own ancestor terms, chosen
#' so that the column holds at most `l` distinct values over the selected rows.
#'
#' The occurrence count of a term is the number of times it appears in the
#' `ancestors` list column of the selected rows. For an occurrence threshold
#' `k`, a row is assigned its most frequent ancestor whose count is `<= k`
#' (or its first, i.e. rarest, ancestor when none qualifies). `k` is increased
#' until the number of distinct assigned terms drops to `l` or below.
#'
#' @param main_dt data.table with a list column `ancestors` (character
#'   vectors of term labels). Modified by reference: `ancestors` is re-ordered
#'   by ascending occurrence count and the `order_<l>` columns are added.
#' @param rows_of_interest Logical vector selecting the rows to process;
#'   defaults to all rows. `NA` entries are treated as not selected.
#' @param ls Numeric vector of upper bounds on the number of distinct terms;
#'   one column is created per entry.
#' @return `main_dt` (the same object that was modified by reference).
add_higher_order <- function(main_dt, rows_of_interest=rep(TRUE, times=main_dt[, .N]), ls = c(15, 30)){
  # Process the loosest limit first so that the search over k for a stricter
  # limit can resume where the previous one stopped.
  ls <- sort(ls, decreasing = TRUE)
  n_rows <- sum(rows_of_interest, na.rm = TRUE)

  # Named lookup vector: term label -> number of occurrences in the selected rows.
  ancestor_occurences <- as.data.table(main_dt[rows_of_interest, sort(table(unlist(ancestors)))])
  setnames(ancestor_occurences, c('V1', 'N'), c('term', 'counts'))
  unique_ancestor2occurences <- ancestor_occurences[, counts]
  names(unique_ancestor2occurences) <- ancestor_occurences[, term]

  # Sort every ancestor vector from the rarest to the most frequent term, so
  # that the highest qualifying index below is the most general qualifying term.
  main_dt[rows_of_interest, ancestors:=lapply(ancestors, function(a) a[order(unique_ancestor2occurences[a])])]

  last_k <- 1L
  for (l in ls) {
    limit_reached <- FALSE
    for (k in seq.int(last_k, n_rows)) {
      new_col <-
        main_dt[rows_of_interest, sapply(ancestors, function(ancestor_vector){
          if(is.null(ancestor_vector)) return(NA)
          eligible <- which(unique_ancestor2occurences[ancestor_vector] <= k)
          # Fall back to the first (rarest) ancestor when no term is rare
          # enough. This is checked explicitly instead of matching the text of
          # the warning raised by max() on an empty vector, which depends on
          # the session locale.
          ancestor_vector[if (length(eligible) == 0L) 1L else max(eligible)]
        }
        )]
      if (l >= uniqueN(new_col)) {
        if (n_rows != length(new_col))
          stop('new column has different length than number of rows')
        main_dt[rows_of_interest, (paste('order', l, sep='_')):=new_col]
        last_k <- k
        limit_reached <- TRUE
        break
      }
    }
    if (!limit_reached)
      warning('no occurrence threshold yields at most ', l,
              ' distinct terms; column order_', l, ' was not added', call. = FALSE)
  }

  main_dt
}

# now go through the curie cols ----
# For 'sample_ontology_curie' and every column ending in '_ncit':
#   1. split cells holding several CURIEs on `separator` and look up each CURIE
#      in the matching ontology,
#   2. collect, per CURIE, the labels of the term and of those ancestors whose
#      id starts with the term's own ontology prefix,
#   3. keep, per original cell, the ancestors shared by all of its CURIEs,
#   4. derive the order_15 / order_30 columns once on the table of distinct
#      cells (stored with the suffix '_unique') and once on the full metadata
#      table, and prefix the latter with the column-specific term name.
metadata_dt <- copy(original_dt)

ncit_cols <- names(original_dt)[endsWith(names(original_dt), '_ncit')]
for (curie_col in c('sample_ontology_curie', ncit_cols)){
  message(curie_col)
  # One row per single CURIE, keyed by the original (possibly multi-CURIE) cell.
  spread_curies <- original_dt[, .(single_curie=unlist(tstrsplit(get(curie_col), separator, fixed=TRUE))), by=mget(curie_col)]
  if (curie_col == 'sample_ontology_curie') {
    # The CURIE prefix decides which ontology handle is queried.
    spread_curies[, sample_ontology:=tstrsplit(single_curie, ':', fixed=TRUE, keep = 1)]
    spread_curies[sample_ontology=='CL', term:=sapply(single_curie, term, object=cl)]
    spread_curies[sample_ontology=='EFO', term:=sapply(single_curie, term, object=efo)]
    spread_curies[sample_ontology=='UBERON', term:=sapply(single_curie, term, object=uberon)]
    spread_curies[, sample_ontology:=as.factor(sample_ontology)]
  } else {
    spread_curies[, term:=sapply(single_curie, term, object=ncit)]
  }
  spread_curies[, term_name:=sapply(term, function(t) unname(termLabel(t)))]
  spread_curies[, ancestors:=lapply(term, function(t) {
    ancestors <- termLabel(myAncestors(t))
    # Drop ancestors imported from other ontologies.
    ancestors <- ancestors[startsWith(names(ancestors), toupper(termOntology(t)))]
    c(termLabel(t), ancestors)
  })]
  if (curie_col == 'sample_ontology_curie') {
    aggregated_ancestors <- spread_curies[, .(term_name=paste(sort(term_name), collapse = separator), ancestors=list(unique(unlist(Reduce(intersect, ancestors))))),
                                          by=.(sample_ontology, sample_ontology_curie)]
    metadata_dt[aggregated_ancestors, on=.(sample_ontology_curie), (c('sample_ontology', 'sample_ontology_term')):=mget(c('sample_ontology', 'term_name'))]
    metadata_dt[, sample_ontology:=as.factor(sample_ontology)]
    # Sample ontologies are summarised separately, one ontology at a time.
    for (this_ontology in aggregated_ancestors[, levels(sample_ontology)]) {
      add_higher_order(aggregated_ancestors, rows_of_interest = aggregated_ancestors[, sample_ontology == this_ontology])
    }
    print(aggregated_ancestors[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30)), by=sample_ontology])
  } else {
    aggregated_ancestors <- spread_curies[, .(term_name=paste(sort(term_name), collapse = separator), ancestors=list(unique(unlist(Reduce(intersect, ancestors))))),
                                          by=mget(curie_col)]
    add_higher_order(aggregated_ancestors)
    print(aggregated_ancestors[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30))])
  }
  cols_to_add <- names(aggregated_ancestors)[startsWith(names(aggregated_ancestors), 'order')]
  curie_col_term <- sub('curie', 'term', curie_col, fixed = TRUE)
  # Copy the per-distinct-cell results and the shared ancestors onto every metadata row.
  metadata_dt[aggregated_ancestors, on=curie_col, (c(paste(curie_col_term, cols_to_add, 'unique', sep = '_'), 'ancestors')):=mget(c(cols_to_add, 'ancestors'))]
  if (curie_col == 'sample_ontology_curie') {
    # Report rows whose manual annotation is not among the computed ancestors.
    non_matching_manual <- metadata_dt[!mapply(function(term, a) term %in% a, sample_ontology_term_high_order_manual, ancestors)]
    if(nrow(non_matching_manual) > 0)
      print(non_matching_manual[, .(EpiRR, project, biomaterial_type, sample_ontology_curie, cell_type, tissue_type, line, sample_ontology_term_high_order_manual)])
    for (this_ontology in metadata_dt[, levels(sample_ontology)]) {
      add_higher_order(metadata_dt, rows_of_interest = metadata_dt[, sample_ontology == this_ontology])
    }
    print(metadata_dt[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30)), by=sample_ontology])
  } else {
    if (curie_col == 'disease_ontology_curie_ncit') {
      non_matching_manual <- metadata_dt[!mapply(function(term, a) term %in% a, disease_intermediate_order_manual, ancestors)]
      # Mismatches are only reported; the interactive browser() call that used
      # to sit here was removed because this script is run through Rscript.
      if(nrow(non_matching_manual)>0){
        print(non_matching_manual[, .(EpiRR, project, biomaterial_type, disease, disease_ontology_curie, disease_ontology_curie_ncit, disease_intermediate_order_manual, disease_high_order_manual)])
      }
    }
    add_higher_order(metadata_dt)
    print(metadata_dt[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30))])
  }
  # The helper column must not leak into the next iteration or the output.
  metadata_dt[, ancestors:=NULL]
  cols_to_rename <- names(metadata_dt)[startsWith(names(metadata_dt), 'order')]
  setnames(metadata_dt, cols_to_rename, paste(curie_col_term, cols_to_rename, sep = '_'))
}

# Replace the numeric limits in the column names by readable labels
# (dropping an optional 'ncit_' prefix), then write the extended table.
final_names <- names(metadata_dt)
final_names <- gsub('(ncit_)?order_15', 'high_order', final_names)
final_names <- gsub('(ncit_)?order_30', 'intermediate_order', final_names)
setnames(metadata_dt, final_names)

fwrite(metadata_dt, file = 'IHEC_metadata_harmonization.v0.11.extended.csv')
109 changes: 109 additions & 0 deletions openrefine/v0.11/create_v0.11.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import io
import os.path
import urllib
import urllib.request
from subprocess import run

import pandas as pd

# The relative paths below are resolved against the git project root, which is
# two directory levels above the directory this script is started from.
os.chdir('../../')

# Build the OpenRefine project and apply the rules - THE OPENREFINE SERVER HAS TO BE RUNNING.
# The openrefine-client needs a csv as input to create a project properly.
openrefine_client = './openrefine/openrefine-client_0-3-10_linux'  # path to the openrefine executable
initial_csv = './openrefine/v0.10/IHEC_metadata_harmonization.v0.10.extended.csv'  # csv to build project from

# The project is addressed by the name of the input csv without its extension.
initial_csv_filename = os.path.basename(initial_csv)
intermediate_project_name = os.path.splitext(initial_csv_filename)[0]
# run([openrefine_client, '--delete', intermediate_project_name], check=True)
run([openrefine_client, '--create', initial_csv], check=True)

# Mapping issues and conflicts were resolved manually in OpenRefine; the
# exported operation histories are replayed here in this order.
for rules_json in ('openrefine/v0.11/fixing_inconsistencies.json',
                   'openrefine/v0.11/fix_other_and_blank.json'):
    run([openrefine_client, '--apply', rules_json, intermediate_project_name], check=True)

# Export the cleaned project as the intermediate v0.11 table.
v0_11_extended_intermediate_csv = './openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.intermediate.csv'
export_command = [
    openrefine_client,
    '--export',
    '--output=' + v0_11_extended_intermediate_csv,
    intermediate_project_name,
]
run(export_command, check=True)

# Attach the EpiRR record type (as 'donor_type') to every row of the exported
# table and write the result back without the automatically derived columns.
v0_11_extended_intermediate = pd.read_csv(v0_11_extended_intermediate_csv)
# Read the response inside a context manager so the HTTP connection is closed.
with urllib.request.urlopen('https://www.ebi.ac.uk/vg/epirr/view/all?format=json') as epirr_response:
    epirr_all = epirr_response.read()
# Hand pandas a file-like object: passing the raw JSON payload itself is deprecated.
epirr_all_dt = pd.read_json(io.BytesIO(epirr_all))
epirr_all_dt.rename(columns={'type': 'donor_type'}, inplace=True)
v0_11_extended_intermediate_merged = pd.merge(
    v0_11_extended_intermediate,
    epirr_all_dt[['full_accession', 'donor_type']],
    how="inner",
    left_on='EpiRR',
    right_on='full_accession',
    validate='one_to_one'
)
v0_11_extended_intermediate_merged.drop(columns='full_accession', inplace=True)

# The inner join drops rows without an EpiRR match; fail loudly if that happened.
assert (len(v0_11_extended_intermediate) == len(v0_11_extended_intermediate_merged)), \
    'some EpiRR ids of the intermediate table have no match in the EpiRR registry'
# Columns ending in 'order' or 'unique' are recomputed by the R script, so they
# are removed before the intermediate csv is overwritten.
automatic_higher_level = v0_11_extended_intermediate_merged.columns.str.endswith(
    'order') | v0_11_extended_intermediate_merged.columns.str.endswith('unique')
v0_11_extended_intermediate_merged.loc[:, ~automatic_higher_level].to_csv(v0_11_extended_intermediate_csv, index=False)

# Run the R script from its own directory; it reads the intermediate csv and
# writes IHEC_metadata_harmonization.v0.11.extended.csv next to it.
run(['Rscript', 'add_higher_order_v0.11.R'], check=True, cwd='./openrefine/v0.11')

v0_11_extended_csv = './openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.csv' # output of the R script, rewritten below
v0_11_extended = pd.read_csv(v0_11_extended_csv)
v0_11_extended.sort_values(by='EpiRR', inplace=True)

# Mapping from the working column names to the published column names.
# Its values also define which columns end up in the final (non-extended) csv.
# NOTE(review): 'sample_ontology_term_high_order_manual' is published as
# 'harm_sample_ontology_intermediate' (high vs. intermediate) - confirm this is intended.
renaming_dict = {'EpiRR': 'EpiRR',
'EpiRR_status': 'EpiRR_status',
'project': 'project',
'biomaterial_type': 'harm_biomaterial_type',
'line': 'harm_line',
'markers': 'harm_markers',
'cell_type': 'harm_cell_type',
'tissue_type': 'harm_tissue_type',
'sample_ontology_curie': 'harm_sample_ontology_curie',
'disease': 'harm_disease',
'disease_ontology_curie': 'harm_disease_ontology_curie',
'donor_age': 'harm_donor_age',
'donor_age_unit': 'harm_donor_age_unit',
'donor_health_status': 'harm_donor_health_status',
'donor_health_status_ontology_curie': 'harm_donor_health_status_ontology_curie',
'donor_id': 'harm_donor_id',
'donor_life_stage': 'harm_donor_life_stage',
'health_state': 'harm_donor_life_status',
'sex': 'harm_donor_sex',
'sample_ontology_term_high_order_manual': 'harm_sample_ontology_intermediate',
'disease_high_order_manual': 'harm_disease_high',
'disease_intermediate_order_manual': 'harm_disease_intermediate',
'donor_type': 'donor_type'}

v0_11_extended.rename(columns=renaming_dict, inplace=True)
# Fix the column order of the extended table; selecting by name raises a
# KeyError if any of the listed columns is missing.
v0_11_extended = v0_11_extended[['EpiRR', 'project', 'harm_biomaterial_type', 'harm_sample_ontology_intermediate', 'harm_disease_high', 'harm_disease_intermediate',
'EpiRR_status', 'harm_cell_type', 'harm_line', 'harm_tissue_type', 'harm_sample_ontology_curie', 'harm_markers',
'sample_ontology', 'sample_ontology_term', 'sample_ontology_term_high_order_JeffreyHyacinthe', 'sample_ontology_term_high_order_JonathanSteif', 'sample_ontology_term_intermediate_order_unique', 'sample_ontology_term_high_order_unique', 'sample_ontology_term_intermediate_order', 'sample_ontology_term_high_order',
'harm_disease', 'harm_disease_ontology_curie', 'disease_ontology_curie_ncit', 'disease_ontology_term_intermediate_order_unique', 'disease_ontology_term_high_order_unique', 'disease_ontology_term_intermediate_order', 'disease_ontology_term_high_order',
'donor_type', 'harm_donor_id', 'harm_donor_age', 'harm_donor_age_unit', 'harm_donor_life_stage', 'harm_donor_sex',
'harm_donor_health_status', 'harm_donor_health_status_ontology_curie', 'donor_health_status_ontology_curie_ncit', 'donor_health_status_ontology_term_intermediate_order_unique', 'donor_health_status_ontology_term_high_order_unique', 'donor_health_status_ontology_term_intermediate_order', 'donor_health_status_ontology_term_high_order',
'harm_donor_life_status']]

# Overwrite the extended csv with the renamed, reordered table.
v0_11_extended.to_csv(v0_11_extended_csv, index=False)

final_csv = './openrefine/v0.11/IHEC_metadata_harmonization.v0.11.csv'

# The final csv keeps only the columns whose names are values of renaming_dict.
v0_11_extended.loc[:, v0_11_extended.columns.isin(renaming_dict.values())].to_csv(final_csv, index=False)

# Record the cell-level differences between v0.10 and v0.11 as json.
# Both tables are indexed by EpiRR and sorted along both axes before they are
# handed to DataFrame.compare. `axis` is passed by keyword because it is
# keyword-only in current pandas releases.
old = pd.read_csv(initial_csv)
old.index = old.EpiRR
old.sort_index(axis=0, inplace=True)
old.sort_index(axis=1, inplace=True)
new = pd.read_csv(v0_11_extended_csv)
new.index = new.EpiRR
# Undo the publication renaming so that the column names line up with v0.10.
new.rename(columns={v: k for k, v in renaming_dict.items()}, inplace=True)
# donor_type is dropped from the new table before the comparison.
new.drop(columns=['donor_type'], inplace=True)
new.sort_index(axis=0, inplace=True)
new.sort_index(axis=1, inplace=True)

diff_tbl = old.compare(new)
# Label the two sides by version and show both the old and the published column name.
diff_tbl.rename(columns={'self': 'v0.10', 'other': 'v0.11'}, inplace=True)
diff_tbl.rename(columns={k: k + ':' + v for k, v in renaming_dict.items()}, inplace=True)
# Keep only the cells that actually differ for every EpiRR entry.
diff_tbl.apply(lambda x: [x.dropna()], axis=1).to_json('openrefine/v0.11/diff_v0.10_v0.11.json', indent=True)
Loading

0 comments on commit c9f4a59

Please sign in to comment.