-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d89aa91
commit c9f4a59
Showing
15 changed files
with
18,669 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Run the renv activation script located at renv/activate.R. | ||
source("renv/activate.R") |
2,659 changes: 2,659 additions & 0 deletions
2,659
openrefine/v0.11/IHEC_metadata_harmonization.v0.11.csv
Large diffs are not rendered by default.
Oops, something went wrong.
2,659 changes: 2,659 additions & 0 deletions
2,659
openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.csv
Large diffs are not rendered by default.
Oops, something went wrong.
2,659 changes: 2,659 additions & 0 deletions
2,659
openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.intermediate.csv
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
# Setup: restore the renv package library before anything else is loaded. | ||
renv::restore() | ||
|
||
library(rols) | ||
library(data.table) | ||
# presumably defines myAncestors(), which is called further down in this script - TODO confirm | ||
source('myAncestors.R') | ||
|
||
# Load the intermediate IHEC metadata table (path is relative to the working directory). | ||
original_dt <- fread('IHEC_metadata_harmonization.v0.11.extended.intermediate.csv') | ||
# Delimiter used when one cell holds several CURIEs / term names. | ||
separator <- '::' | ||
|
||
### Ontologies | ||
# One rols Ontology handle per ontology id used below (cl, efo, uberon, ncit). | ||
cl <- Ontology("cl") | ||
efo <- Ontology("efo") | ||
uberon <- Ontology("uberon") | ||
ncit <- Ontology("ncit") | ||
|
||
# Adds one column per entry of `ls` to `main_dt`, named `order_<l>`, holding one higher-level | ||
# annotation per selected row such that the column has at most `l` distinct values. | ||
# | ||
# main_dt:          data.table with a list column `ancestors` (one vector of terms per row). | ||
#                   It is modified by reference through `:=`. | ||
# rows_of_interest: logical row selector; only these rows are counted and annotated. | ||
# ls:               maximum numbers of distinct terms; one output column is attempted per value. | ||
# | ||
# Returns `main_dt`. If no cutoff `k` brings the number of distinct terms down to `l`, the | ||
# corresponding `order_<l>` column is not written. | ||
add_higher_order <- function(main_dt, rows_of_interest=rep(TRUE, times=main_dt[, .N]), ls = c(15, 30)){ | ||
|
||
  # Largest limit first, so the search for the next (smaller) limit can resume at the last cutoff. | ||
  ls <- sort(ls, decreasing = TRUE) | ||
|
||
  # Exploratory code kept for reference: | ||
  # df_ihec[aggregated_ancestors, on=.(sample_ontology_curie), sample_ontology_ancestors:=ancestors] | ||
  # ancestor_occurences <- as.data.table(df_ihec[, sort(table(unlist(sample_ontology_ancestors)))]) | ||
  # setnames(ancestor_occurences, c('V1', 'N'), c('term', 'counts')) | ||
  # ancestor2occurences <- ancestor_occurences[, counts] | ||
  # names(ancestor2occurences) <- ancestor_occurences[, term] | ||
  # cutoff <- 100 | ||
  # ggplot(ancestor_occurences[counts >= cutoff], aes(y = reorder(term, counts), x = counts)) + geom_bar(stat = 'identity') + labs(y = 'term', title=paste('ancestor terms with >=', cutoff, 'occurences in all sample_ontology_entries')) | ||
  # ggplot(ancestor_occurences, aes(x = counts)) + geom_histogram() + labs(title=paste('occurence distribution of terms in all sample_ontology_entries')) | ||
|
||
|
||
|
||
  # Named vector term -> number of occurrences over the ancestor vectors of the selected rows. | ||
  ancestor_occurences <- as.data.table(main_dt[rows_of_interest, sort(table(unlist(ancestors)))]) | ||
  setnames(ancestor_occurences, c('V1', 'N'), c('term', 'counts')) | ||
  unique_ancestor2occurences <- ancestor_occurences[, counts] | ||
  names(unique_ancestor2occurences) <- ancestor_occurences[, term] | ||
|
||
  # Reorder every row's ancestors by ascending occurrence count. | ||
  main_dt[rows_of_interest, ancestors:=sapply(ancestors, function(a) list(a[order(unique_ancestor2occurences[a])]))] | ||
|
||
  # Loop-invariant number of selected rows (NA selectors are not counted). | ||
  n_rows <- sum(rows_of_interest, na.rm = TRUE) | ||
|
||
  last_k <- 1L | ||
  for (l in ls) { | ||
    for (k in seq.int(last_k, n_rows)) { | ||
      # Per row: the last (most frequent) ancestor whose occurrence count is <= k. | ||
      new_col <- | ||
        main_dt[rows_of_interest, sapply(ancestors, function(ancestor_vector){ | ||
          if(is.null(ancestor_vector)) return(NA) | ||
          within_cutoff <- which(unique_ancestor2occurences[ancestor_vector] <= k) | ||
          # No ancestor within the cutoff: fall back to the first entry. This is checked | ||
          # explicitly instead of matching the (locale-dependent) text of max()'s warning. | ||
          ancestor_vector[if (length(within_cutoff) > 0) max(within_cutoff) else 1] | ||
        } | ||
        )] | ||
      # Accept the first cutoff that yields at most l distinct terms. | ||
      if (l >= uniqueN(new_col)) { | ||
        if (n_rows != length(new_col)) | ||
          stop('new column has different length than number of rows') | ||
        main_dt[rows_of_interest, (paste('order', l, sep='_')):=new_col] | ||
        last_k <- k | ||
        break | ||
      } | ||
    } | ||
  } | ||
|
||
  main_dt | ||
} | ||
|
||
# now go through the curie cols ---- | ||
# Work on a copy so original_dt stays untouched while columns are added. | ||
metadata_dt <- copy(original_dt) | ||
|
||
ncit_cols <- names(original_dt)[endsWith(names(original_dt), '_ncit')] | ||
for (curie_col in c('sample_ontology_curie', ncit_cols)){ | ||
  message(curie_col) | ||
  # One row per single CURIE: cells holding several CURIEs are split on `separator`. | ||
  spread_curies <- original_dt[, .(single_curie=unlist(tstrsplit(get(curie_col), separator, fixed=TRUE))), by=mget(curie_col)] | ||
  if (curie_col == 'sample_ontology_curie') { | ||
    # The CURIE prefix decides which ontology the term is looked up in. | ||
    spread_curies[, sample_ontology:=tstrsplit(single_curie, ':', fixed=TRUE, keep = 1)] | ||
    spread_curies[sample_ontology=='CL', term:=sapply(single_curie, term, object=cl)] | ||
    spread_curies[sample_ontology=='EFO', term:=sapply(single_curie, term, object=efo)] | ||
    spread_curies[sample_ontology=='UBERON', term:=sapply(single_curie, term, object=uberon)] | ||
    spread_curies[, sample_ontology:=as.factor(sample_ontology)] | ||
  } else { | ||
    spread_curies[, term:=sapply(single_curie, term, object=ncit)] | ||
  } | ||
  spread_curies[, term_name:=sapply(term, function(t) unname(termLabel(t)))] | ||
  # Label of the term itself followed by the labels of its ancestors from the same ontology. | ||
  spread_curies[, ancestors:=lapply(term, function(t) { | ||
    ancestors <- termLabel(myAncestors(t)) | ||
    ancestors <- ancestors[startsWith(names(ancestors), toupper(termOntology(t)))] | ||
    c(termLabel(t), ancestors) | ||
  })] | ||
  # Collapse back to one row per original cell; for multi-term cells only the ancestors | ||
  # shared by all terms are kept (Reduce(intersect, ...)). | ||
  if (curie_col == 'sample_ontology_curie') { | ||
    aggregated_ancestors <- spread_curies[, .(term_name=paste(sort(term_name), collapse = separator), ancestors=list(unique(unlist(Reduce(intersect, ancestors))))), | ||
                                          by=.(sample_ontology, sample_ontology_curie)] | ||
    metadata_dt[aggregated_ancestors, on=.(sample_ontology_curie), (c('sample_ontology', 'sample_ontology_term')):=mget(c('sample_ontology', 'term_name'))] | ||
    metadata_dt[, sample_ontology:=as.factor(sample_ontology)] | ||
    for (this_ontology in aggregated_ancestors[, levels(sample_ontology)]) { | ||
      add_higher_order(aggregated_ancestors, rows_of_interest = aggregated_ancestors[, sample_ontology == this_ontology]) | ||
    } | ||
    print(aggregated_ancestors[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30)), by=sample_ontology]) | ||
  } else { | ||
    aggregated_ancestors <- spread_curies[, .(term_name=paste(sort(term_name), collapse = separator), ancestors=list(unique(unlist(Reduce(intersect, ancestors))))), | ||
                                          by=mget(curie_col)] | ||
    add_higher_order(aggregated_ancestors) | ||
    print(aggregated_ancestors[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30))]) | ||
  } | ||
  # Copy the order columns computed on the unique CURIEs (suffix `_unique`) and the | ||
  # ancestors into the full table. | ||
  cols_to_add <- names(aggregated_ancestors)[startsWith(names(aggregated_ancestors), 'order')] | ||
  cure_col_term <- sub('curie', 'term', curie_col, fixed = TRUE) | ||
  metadata_dt[aggregated_ancestors, on=curie_col, (c(paste(cure_col_term, cols_to_add, 'unique', sep = '_'), 'ancestors')):=mget(c(cols_to_add, 'ancestors'))] | ||
  if (curie_col == 'sample_ontology_curie') { | ||
    # Report rows whose manual high-order term is not among the computed ancestors. | ||
    non_matching_manual <- metadata_dt[!mapply(function(term, a) term %in% a, sample_ontology_term_high_order_manual, ancestors)] | ||
    if(nrow(non_matching_manual) > 0) | ||
      print(non_matching_manual[, .(EpiRR, project, biomaterial_type, sample_ontology_curie, cell_type, tissue_type, line, sample_ontology_term_high_order_manual)]) | ||
    for (this_ontology in metadata_dt[, levels(sample_ontology)]) { | ||
      add_higher_order(metadata_dt, rows_of_interest = metadata_dt[, sample_ontology == this_ontology]) | ||
    } | ||
    print(metadata_dt[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30)), by=sample_ontology]) | ||
  } else { | ||
    if (curie_col == 'disease_ontology_curie_ncit') { | ||
      non_matching_manual <- metadata_dt[!mapply(function(term, a) term %in% a, disease_intermediate_order_manual, ancestors)] | ||
      # Report only: no interactive browser() call here, so the script can run unattended. | ||
      if(nrow(non_matching_manual)>0){ | ||
        print(non_matching_manual[, .(EpiRR, project, biomaterial_type, disease, disease_ontology_curie, disease_ontology_curie_ncit, disease_intermediate_order_manual, disease_high_order_manual)]) | ||
      } | ||
    } | ||
    add_higher_order(metadata_dt) | ||
    print(metadata_dt[, .(unique_terms_higher_order=uniqueN(order_15), unique_terms_intermediate_order=uniqueN(order_30))]) | ||
  } | ||
  # Drop the helper column and prefix the freshly added order columns with the term column | ||
  # name, so the next iteration starts without any column named `order...`. | ||
  metadata_dt[, ancestors:=NULL] | ||
  cols_to_rename <- names(metadata_dt)[startsWith(names(metadata_dt), 'order')] | ||
  setnames(metadata_dt, cols_to_rename, paste(cure_col_term, cols_to_rename, sep = '_')) | ||
} | ||
|
||
# Replace the numeric limits in the column names by readable level names. | ||
names(metadata_dt) <- gsub('(ncit_)?order_15', 'high_order', names(metadata_dt)) | ||
names(metadata_dt) <- gsub('(ncit_)?order_30', 'intermediate_order', names(metadata_dt)) | ||
|
||
fwrite(metadata_dt, file = 'IHEC_metadata_harmonization.v0.11.extended.csv') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
import os.path | ||
import urllib | ||
import urllib.request | ||
from subprocess import run | ||
|
||
import pandas as pd | ||
|
||
# All paths below are relative to the project root of the git project. The chdir goes two | ||
# levels up, so this file has to be started from two directory levels below the root | ||
# (presumably openrefine/v0.11/ - TODO confirm). | ||
os.chdir('../../') | ||
|
||
# create openrefine project and apply rules - OPENREFINE SERVER HAS TO BE RUNNING | ||
# creating openrefine projects via the openrefine-client needs a csv as input in order to work properly | ||
openrefine_client = './openrefine/openrefine-client_0-3-10_linux' # path to the openrefine executable | ||
initial_csv = './openrefine/v0.10/IHEC_metadata_harmonization.v0.10.extended.csv' # csv to build project from | ||
|
||
# create project with intermediate version; the project name is the csv file name without extension | ||
intermediate_project_name = os.path.splitext(os.path.basename(initial_csv))[0] | ||
# run([openrefine_client, '--delete', intermediate_project_name], check=True) | ||
run([openrefine_client, '--create', initial_csv], check=True) | ||
|
||
# here we manually solve some mapping issues and conflicts and the resulting json is then used in this script | ||
|
||
# apply the two rule files in this order; check=True aborts the script if the client fails | ||
run([openrefine_client, '--apply', 'openrefine/v0.11/fixing_inconsistencies.json', intermediate_project_name], | ||
check=True) | ||
run([openrefine_client, '--apply', 'openrefine/v0.11/fix_other_and_blank.json', intermediate_project_name], | ||
check=True) | ||
|
||
# export the project after the rules were applied as the v0.11 intermediate csv | ||
v0_11_extended_intermediate_csv = './openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.intermediate.csv' | ||
run([openrefine_client, '--export', f'--output={v0_11_extended_intermediate_csv}', intermediate_project_name], | ||
check=True) | ||
|
||
v0_11_extended_intermediate = pd.read_csv(v0_11_extended_intermediate_csv) | ||
epirr_all = urllib.request.urlopen('https://www.ebi.ac.uk/vg/epirr/view/all?format=json').read() | ||
epirr_all_dt = pd.read_json(epirr_all) | ||
epirr_all_dt.rename(columns={'type': 'donor_type'}, inplace=True) | ||
v0_11_extended_intermediate_merged = pd.merge( | ||
v0_11_extended_intermediate, | ||
epirr_all_dt[['full_accession', 'donor_type']], | ||
how="inner", | ||
left_on='EpiRR', | ||
right_on='full_accession', | ||
validate='one_to_one' | ||
) | ||
v0_11_extended_intermediate_merged.drop(columns='full_accession', inplace=True) | ||
|
||
assert (len(v0_11_extended_intermediate) == len(v0_11_extended_intermediate_merged)) | ||
automatic_higher_level = v0_11_extended_intermediate_merged.columns.str.endswith( | ||
'order') | v0_11_extended_intermediate_merged.columns.str.endswith('unique') | ||
v0_11_extended_intermediate_merged.loc[:, ~automatic_higher_level].to_csv(v0_11_extended_intermediate_csv, index=False) | ||
|
||
# Run the R script from inside openrefine/v0.11 (cwd=...), so its relative paths resolve there. | ||
run(['Rscript', 'add_higher_order_v0.11.R'], check=True, cwd='./openrefine/v0.11') | ||
|
||
v0_11_extended_csv = './openrefine/v0.11/IHEC_metadata_harmonization.v0.11.extended.csv' # csv read back here and overwritten below | ||
v0_11_extended = pd.read_csv(v0_11_extended_csv) | ||
v0_11_extended.sort_values(by='EpiRR', inplace=True) | ||
|
||
# Old column name -> released column name. The values of this dict also define which | ||
# columns end up in the final (non-extended) csv written at the end of this section. | ||
# NOTE(review): 'sample_ontology_term_high_order_manual' maps to '..._intermediate' - confirm this is intended. | ||
renaming_dict = {'EpiRR': 'EpiRR', | ||
'EpiRR_status': 'EpiRR_status', | ||
'project': 'project', | ||
'biomaterial_type': 'harm_biomaterial_type', | ||
'line': 'harm_line', | ||
'markers': 'harm_markers', | ||
'cell_type': 'harm_cell_type', | ||
'tissue_type': 'harm_tissue_type', | ||
'sample_ontology_curie': 'harm_sample_ontology_curie', | ||
'disease': 'harm_disease', | ||
'disease_ontology_curie': 'harm_disease_ontology_curie', | ||
'donor_age': 'harm_donor_age', | ||
'donor_age_unit': 'harm_donor_age_unit', | ||
'donor_health_status': 'harm_donor_health_status', | ||
'donor_health_status_ontology_curie': 'harm_donor_health_status_ontology_curie', | ||
'donor_id': 'harm_donor_id', | ||
'donor_life_stage': 'harm_donor_life_stage', | ||
'health_state': 'harm_donor_life_status', | ||
'sex': 'harm_donor_sex', | ||
'sample_ontology_term_high_order_manual': 'harm_sample_ontology_intermediate', | ||
'disease_high_order_manual': 'harm_disease_high', | ||
'disease_intermediate_order_manual': 'harm_disease_intermediate', | ||
'donor_type': 'donor_type'} | ||
|
||
v0_11_extended.rename(columns=renaming_dict, inplace=True) | ||
# Select and order the columns of the extended table; a missing column raises a KeyError here. | ||
v0_11_extended = v0_11_extended[['EpiRR', 'project', 'harm_biomaterial_type', 'harm_sample_ontology_intermediate', 'harm_disease_high', 'harm_disease_intermediate', | ||
'EpiRR_status', 'harm_cell_type', 'harm_line', 'harm_tissue_type', 'harm_sample_ontology_curie', 'harm_markers', | ||
'sample_ontology', 'sample_ontology_term', 'sample_ontology_term_high_order_JeffreyHyacinthe', 'sample_ontology_term_high_order_JonathanSteif', 'sample_ontology_term_intermediate_order_unique', 'sample_ontology_term_high_order_unique', 'sample_ontology_term_intermediate_order', 'sample_ontology_term_high_order', | ||
'harm_disease', 'harm_disease_ontology_curie', 'disease_ontology_curie_ncit', 'disease_ontology_term_intermediate_order_unique', 'disease_ontology_term_high_order_unique', 'disease_ontology_term_intermediate_order', 'disease_ontology_term_high_order', | ||
'donor_type', 'harm_donor_id', 'harm_donor_age', 'harm_donor_age_unit', 'harm_donor_life_stage', 'harm_donor_sex', | ||
'harm_donor_health_status', 'harm_donor_health_status_ontology_curie', 'donor_health_status_ontology_curie_ncit', 'donor_health_status_ontology_term_intermediate_order_unique', 'donor_health_status_ontology_term_high_order_unique', 'donor_health_status_ontology_term_intermediate_order', 'donor_health_status_ontology_term_high_order', | ||
'harm_donor_life_status']] | ||
|
||
# Overwrite the extended csv with the renamed and reordered table. | ||
v0_11_extended.to_csv(v0_11_extended_csv, index=False) | ||
|
||
final_csv = './openrefine/v0.11/IHEC_metadata_harmonization.v0.11.csv' | ||
|
||
# The final csv keeps only the columns whose names are values of renaming_dict. | ||
v0_11_extended.loc[:, v0_11_extended.columns.isin(renaming_dict.values())].to_csv(final_csv, index=False) | ||
|
||
old = pd.read_csv(initial_csv) | ||
old.index = old.EpiRR | ||
old.sort_index(0, inplace=True) | ||
old.sort_index(1, inplace=True) | ||
new = pd.read_csv(v0_11_extended_csv) | ||
new.index = new.EpiRR | ||
new.rename(columns={v: k for k, v in renaming_dict.items()}, inplace=True) | ||
new.drop(columns=['donor_type'], inplace=True) | ||
new.sort_index(0, inplace=True) | ||
new.sort_index(1, inplace=True) | ||
|
||
diff_tbl = old.compare(new) | ||
diff_tbl.rename(columns={'self': 'v0.10', 'other': 'v0.11'}, inplace=True) | ||
diff_tbl.rename(columns={k: k + ':' + v for k, v in renaming_dict.items()}, inplace=True) | ||
diff_tbl.apply(lambda x: [x.dropna()], axis=1).to_json('openrefine/v0.11/diff_v0.10_v0.11.json', indent=True) |
Oops, something went wrong.