-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdataPreprocessing.Rmd
136 lines (121 loc) · 5.95 KB
/
dataPreprocessing.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
---
title: "dataPreparation"
author:
- "ddedesener"
- "DeniseSl22"
date: "05/02/22"
output:
md_document:
variant: markdown_github
always_allow_html: true
---
## Introduction
In this section of the workflow, we will obtain the metabolomics data and apply filtering options, to create a dataset ready for further statistical and pathway analysis.
## First, we setup the required libraries to get started.
```{r setup, warning=FALSE, message=FALSE}
# check if libraries are already installed > otherwise install it
if(!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager",repos = "http://cran.us.r-project.org")
if(!"dplyr" %in% installed.packages()) BiocManager::install("dplyr")
if(!"stringr" %in% installed.packages()){install.packages("stringr")}
#load libraries
library(dplyr)
library(stringr)
```
## Second, we download the required data, read the metadata and filter out not-relevant data.
```{r read_data, warning=FALSE, message=FALSE}
#Library to download data from online files:
if(!"downloader" %in% installed.packages()){install.packages("downloader")}
require(downloader)
##Download metadata, extract metabolomics sample IDs, location and disorders.
if(file.exists("data/hmp2_metadata.csv")){print("Metadata already downloaded")}else{
fileUrl <- "https://ibdmdb.org/tunnel/products/HMP2/Metadata/hmp2_metadata.csv?accessType=DOWNLOAD"
require(downloader)
download(fileUrl, "data/hmp2_metadata.csv", mode = "wb")
}
#read metadata file
metaData <- read.csv("data/hmp2_metadata.csv")
#filter out by data type and week number
metaDataMBX <- subset(metaData, metaData$data_type == "metabolomics" )
#we need to have the samples which has same visit number
metaDataMBX<- subset(metaDataMBX, metaDataMBX$visit_num == 4)
#we should match transcriptomics (htx) samples and metabolomics (mbx) samples with participantID
#but samples are given by their externalID in mbx file so we should keep them both
#select columns which will be used
metaDataMBX <- metaDataMBX %>% dplyr::select(External.ID,Participant.ID,diagnosis)
#rename columns of metaDataMBX
colnames(metaDataMBX) <- c("ExternalID","ParticipantID","disease" )
#download and read metabolomics peak intensity data
if(file.exists("data/metabolomics.csv.gz")){print("Metabolomics zipped data already downloaded")}else{
fileUrl <- "https://ibdmdb.org/tunnel/products/HMP2/Metabolites/1723/HMP2_metabolomics.csv.gz?accessType=DOWNLOAD"
download(fileUrl, "data/metabolomics.csv.gz", mode = "wb")
}
#Note: if the URL download does not work, the zipped file is located on GitHub to continue the rest of this script.
if(file.exists("data/metabolomics.csv")){print("Unzipped Metabolomics data already downloaded")}else{
if(!"R.utils" %in% installed.packages()){install.packages("R.utils")}
library(R.utils)
gunzip("data/metabolomics.csv.gz", remove=FALSE)
}
mbxData <- read.csv("data/metabolomics.csv")
#delete not used columns
mbxData = subset(mbxData, select = -c(1,2,3,4,7) )
```
## Third, we perform data extraction, and process the data
```{r filtering,warning=FALSE, message=FALSE}
### row (metabolite) filtering ###
#delete metabolite or row if it has NA or empty value for hmdbID
mbxData<- mbxData[!(is.na(mbxData$HMDB...Representative.ID.) | mbxData$HMDB...Representative.ID.=="") , ]
#remove rows which has hmdb as "redundant ion"
mbxData<- mbxData[!(mbxData$HMDB...Representative.ID.=="redundant ion") , ]
#remove character (asterisk) in some hmdb column values
mbxData$HMDB...Representative.ID.<- stringr::str_replace(mbxData$HMDB...Representative.ID., '\\*', '')
#Update HMDB IDs to new data structure
mbxData$HMDB...Representative.ID.<- stringr::str_replace(mbxData$HMDB...Representative.ID., 'HMDB', 'HMDB00')
#back up original mbxdata
mbxData.b <- mbxData
### modify mbxData based on sample names given in metaData file (created with the criteria visit_num=4 )###
#filter out mbxData columns (samples) based metaDataMBX externalIDs
names.use <- names(mbxData)[ names(mbxData) %in% metaDataMBX$ExternalID]
#update mbx data with used names
mbxData <- mbxData [ ,names.use]
#order data based on col names
mbxData <- mbxData[ , order(names(mbxData))]
#order metadata based on externalID
metaDataMBX <- metaDataMBX[order(metaDataMBX$ExternalID),]
#add HMDBID and Compound Name column to the mbx data
mbxData <- cbind(mbxData.b$HMDB...Representative.ID., mbxData.b$Metabolite,mbxData)
colnames(mbxData)[1] <- "HMDB.ID"
colnames(mbxData)[2] <- "Compound.Name"
#add disease labels to the mbx data
diseaseLabels <- metaDataMBX$disease
##Add two empty strings to match with additional column data.
diseaseLabels <- append(diseaseLabels, "NA",after = 0)
diseaseLabels <- append(diseaseLabels, "NA",after = 0)
mbxData <- rbind(diseaseLabels, mbxData)
```
## Fourth, we split up the data for UC and CD, include the control data nonIBD, and save this data to an output folder.
```{r writing_to_file,warning=FALSE, message=FALSE }
#write only UC versus nonIBD comparison
mbxDataUC <- mbxData[ ,(mbxData[1, ] == "UC" | mbxData[1, ] == "nonIBD")]
#add hmdb id again
mbxDataUC <- cbind(mbxData[,1:2],mbxDataUC)
colnames(mbxDataUC)[1]="HMBDB.ID"
colnames(mbxDataUC)[2] <- "Compound.Name"
write.table(mbxDataUC, "output/mbxDataUC_nonIBD.csv", sep =",", row.names = FALSE)
#write only CD_healthy comparison
mbxDataCD <- mbxData[ ,(mbxData[1, ] == "CD" | mbxData[1, ] == "nonIBD")]
mbxDataCD <- cbind(mbxData[,1:2],mbxDataCD)
colnames(mbxDataCD)[1]="HMBDB.ID"
colnames(mbxDataCD)[2] <- "Compound.Name"
write.table(mbxDataCD, "output/mbxDataCD_nonIBD.csv", sep =",", row.names = FALSE)
```
### Last, we create a Jupyter notebook and markdown file from this script
```{r writing_to_notebooks,warning=FALSE, message=FALSE }
#Jupyter Notebook file
if(!"devtools" %in% installed.packages()) BiocManager::install("devtools")
devtools::install_github("mkearney/rmd2jupyter", force=TRUE)
library(devtools)
library(rmd2jupyter)
rmd2jupyter("dataPreprocessing.Rmd")
##Clean up R-studio environment
remove(diseaseLabels, fileUrl, names.use, mbxData, mbxData.b, mbxDataCD, mbxDataUC, metaDataMBX, metaData)
```