-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
405 lines (316 loc) · 20.8 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
'''
App made by:
Eduardo Reyes Alvarez, Ph.D.
Contact:
eduardo_reyes09@hotmail.com
DepMap release used:
23Q4
Data source:
Original Website: https://depmap.org/portal/download/all/
Note: This website does not provide the direct download links used here. Instead, when clicking
on -View full release details-, it shows some info and references to this Figshare data
repository: https://doi.org/10.25452/figshare.plus.24667905.v2
File 1: OmicsExpressionProteinCodingGenesTPMLogp1.csv (Downloaded as: DepMap_RNASeq_23Q4.csv)
File 2: Model.csv (Downloaded as: DepMap_CellInfo_23Q4.csv)
App version:
V10 (May 27, 2024): Improved the way to request input files from URLs.
'''
###################################################################################################
# Import the required libraries
import os
import time
from typing import List
import requests
import pandas as pd
import plotly.express as px
import streamlit as st
from streamlit_searchbox import st_searchbox
from io import BytesIO
###################################################################################################
# Cached function to download and/or read the required files just once
@st.cache_data(show_spinner=False)
def get_files():
# Show a status bar indicating when each step is completed
with st.status("Loading...", expanded=False) as status:
# Defined file names to be used in this script
rna_file = "DepMap_RNASeq_23Q4.csv"
cell_info_file = "DepMap_CellInfo_23Q4.csv"
# URLs for the files to download
directory1 = "https://plus.figshare.com/ndownloader/files/43347204"
directory2 = "https://plus.figshare.com/ndownloader/files/43746708"
# Check if files exist in the working directory, otherwise download them
if not (os.path.isfile(rna_file) and os.path.isfile(cell_info_file)):
status.update(label="Downloading files...")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
'Referer': 'https://plus.figshare.com'
}
def download_file(url, local_filename):
retries = 3
for attempt in range(retries):
try:
response = requests.get(url, headers=headers)
response.raise_for_status() # Raise HTTPError for bad responses (4xx and 5xx)
with open(local_filename, 'wb') as f:
f.write(response.content)
return True
except requests.exceptions.RequestException as e:
st.error(f'Error on attempt {attempt + 1}: {e}')
if attempt < retries - 1:
time.sleep(2 ** attempt)
else:
return False
if download_file(directory1, rna_file) and download_file(directory2, cell_info_file):
status.update(label="Files downloaded!")
else:
st.error("Failed to download files after multiple attempts.")
st.stop()
else:
status.update(label="Files found!")
# Import the csv files into dataframes
status.update(label="Importing files...")
RNA_expression = pd.read_csv(rna_file)
sample_IDs = pd.read_csv(cell_info_file)
# Sort the IDs by cell line name and get the relevant columns
status.update(label="Pre-processing files...")
sample_IDs = sample_IDs.sort_values(by=["CellLineName"])
sample_IDs = sample_IDs[sample_IDs["CellLineName"].notna()]
sample_IDs = sample_IDs.reset_index(drop=True)
cell_menu = sample_IDs[["ModelID", "CellLineName", "OncotreeLineage", "OncotreePrimaryDisease"]]
cell_menu.columns = ["Achilles ID", "Cell line", "Tissue", "Disease"]
# The first column of the RNASeq dataset has no name, and we need it transposed
RNA_expression = RNA_expression.set_index("Unnamed: 0").T
RNA_expression["Gene"] = RNA_expression.index
RNA_expression["Gene"] = RNA_expression["Gene"].str.replace(r'\s\(\d+\)$', '', regex=True)
RNA_expression = RNA_expression.reset_index(drop=True)
RNA_expression = RNA_expression.set_index("Gene")
# Create a dictionary to map "Achilles ID" to "Cell line" and replace the IDs for names in RNA_expression
id_to_cell_line = dict(zip(cell_menu["Achilles ID"], cell_menu["Cell line"]))
column_names_to_replace = RNA_expression.columns
new_column_names = []
for col in column_names_to_replace:
new_name = id_to_cell_line.get(col, col)
new_column_names.append(new_name)
RNA_expression.columns = new_column_names
RNA_expression = RNA_expression.sort_index(axis=1)
RNA_expression = RNA_expression.sort_index()
# Since not all the cell lines in the DepMap/Achilles project have RNA Seq data, remove those from both menus
cell_menu = cell_menu[cell_menu["Cell line"].isin(RNA_expression.columns)]
cell_menu_tissues = [""] + list(cell_menu["Tissue"].dropna().unique())
cell_menu_tissues.sort()
status.update(label="Ready to begin search!", state="complete", expanded=False)
return RNA_expression, cell_menu, cell_menu_tissues
###################################################################################################
# Step 1 - Set app configuration and load the required files
st.set_page_config(
page_title="Tool 001 - App by Eduardo",
page_icon=":bar_chart:",
layout="wide",
menu_items={
'Get Help': "https://github.com/EdRey05/Streamlit_projects/tree/main/001_RNA_expression_DepMap",
'Report a bug': "mailto:eduardo_reyes09@hotmail.com"})
# Check if the data has already been loaded, otherwise download and import it
if "cell_menu" not in st.session_state or st.session_state["cell_menu"] is None:
message = st.markdown('''
<div style='background-color: #0E6655; padding: 10px; border-radius: 5px; text-align: center; width: 75%; margin: auto;'>
<p style='font-size: 20px; font-weight: bold;'>THE FILES ARE BEING PREPARED... The app will launch shortly</p>
<p>This app was tailored for the RNA Seq dataset from the DepMap portal, release 23Q4.</p>
<p>The file used was OmicsExpressionProteinCodingGenesTPMLogp1.csv and gives values in log2(TPM+1) units.</p>
<p>For more information on the 23Q4 data or newer releases, consult: <a href="https://depmap.org/portal/download/all/" target="_blank">DepMap Portal</a>.</p>
<p>Tutorial: <a href="https://github.com/EdRey05/Streamlit_projects/tree/main/001_RNA_expression_DepMap" target="_blank">Instructions and Demo</a> </p>
</div>
''', unsafe_allow_html=True)
a = st.markdown('<hr style="margin-top: +10px; margin-bottom: +10px;">', unsafe_allow_html=True)
# Start downloading or importing the files
RNA_expression, cell_menu, cell_menu_tissues = get_files()
# Save heavy data variables to session state so get_files only runs one time
st.session_state["RNA_expression"] = RNA_expression
st.session_state["cell_menu"] = cell_menu
st.session_state["cell_menu_tissues"] = cell_menu_tissues
# Initialize variables in the session state to enable full widget interactivity
st.session_state["keep_cells_previous"] = []
st.session_state["keep_cells_current"] = []
st.session_state["search_string_temporal"] = ""
st.session_state["search_results_interactive"] = pd.DataFrame()
st.session_state["df_to_plot"] = pd.DataFrame()
st.session_state["displayed_df_to_plot"] = pd.DataFrame()
# Clear the initial message
message.empty()
a.empty()
st.rerun()
###################################################################################################
# Step 2 - Create app layout
st.title("Retrieve RNASeq data from the DepMap portal (23Q4)")
st.markdown('<hr style="margin-top: +5px; margin-bottom: +5px;">', unsafe_allow_html=True)
col_1_row_1, col_2_row_1 = st.columns([2, 3], gap="medium")
st.markdown('<hr style="margin-top: +10px; margin-bottom: +15px;">', unsafe_allow_html=True)
col_1_row_2, col_2_row_2, col_3_row_2, col_4_row_2, col_5_row_2, col_6_row_2 = st.columns(6, gap="small")
st.markdown('<hr style="margin-top: +5px; margin-bottom: +10px;">', unsafe_allow_html=True)
col_1_row_3, col_2_row_3 = st.columns(2, gap="medium")
###################################################################################################
# Step 3 - Create the main widgets in column 1-row 1, and one more in the same row upon interaction
# The widgets on the first row are created immediately after loading the files
with col_1_row_1:
# Buttons for the two types of cell line search available
st.radio(key="search_by", label="Search cell lines by:", options=["Name", "Tissue type"])
st.markdown('<hr style="margin-top: +10px; margin-bottom: +10px;">', unsafe_allow_html=True)
# This responds to the radio button and displays a different widget depending on the type of search chosen
if st.session_state["search_by"] == "Name":
st.text_input(key="search_string", label="Type the name of the cell line or part of it", value="")
# Generate the results df
search_results = st.session_state["cell_menu"].copy()
search_results["Keep cell line?"] = False
search_results = search_results[search_results["Cell line"].str.contains(st.session_state["search_string"], case=False, na=False)]
elif st.session_state["search_by"] == "Tissue type":
st.selectbox(key="search_string", label="Select a tissue", options=st.session_state["cell_menu_tissues"], index=0)
# Generate the results df
search_results = st.session_state["cell_menu"].copy()
search_results["Keep cell line?"] = False
search_results = search_results[search_results["Tissue"] == st.session_state["search_string"]]
st.markdown('<hr style="margin-top: +10px; margin-bottom: +10px;">', unsafe_allow_html=True)
# When the default option on Column 01 widgets is selected, just clear this column
if st.session_state["search_string"] == "":
col_2_row_1.empty()
# When the string/name searched is not found in the dataset, show recommendations and a warning
elif search_results.empty:
col_2_row_1.empty()
with col_2_row_1:
st.markdown('''
:red[No results matched the input entered] :confused:
:bulb: Try using more/less characters of the cell line name, such as SH or SY if you are looking for SH-SY5Y :bulb:
:warning: If this problem persists :warning: That cell line may not have been included in the DepMap project, yet :disappointed: :sob:
''')
# Proceed to create widgets if a valid string is entered and a results df is obtained
else:
# The widget to select search results takes all col 2-row 2 because its big and resizes itself
with col_2_row_1:
search_results_interactive = st.data_editor(data=search_results, hide_index=True, use_container_width=True)
# Checking or unchecking a box triggers the re-run of the app
# Entering a new search string/term re-runs the app and shows a different df in the editor
# Changing the search method and selecting a tissue type also re-runs the app
# Because of that, we use 3 variables to keep track of changes through the editor with the same or different search term
# Reset the list of previous selections when a new string on the same widget or the other is entered/selected
if st.session_state["search_string"] != st.session_state["search_string_temporal"]:
st.session_state["keep_cells_previous"] = []
st.session_state["search_string_temporal"] = st.session_state["search_string"]
# Get any rows (cell lines) that have been checked through the data editor
selected_cells = search_results_interactive.loc[search_results_interactive["Keep cell line?"], "Cell line"].tolist()
# Add new values to the current selections or keep the ones that were selected before, to both state variables
for name in selected_cells:
if name not in st.session_state["keep_cells_previous"]:
st.session_state["keep_cells_current"].append(name)
st.session_state["keep_cells_previous"].append(name)
# Unchecking a box makes selected_cells shorter so we remove the missing values from both state variables
for name in st.session_state["keep_cells_previous"]:
if name not in selected_cells:
st.session_state["keep_cells_current"].remove(name)
st.session_state["keep_cells_previous"].remove(name)
# The cummulative selections + de-selections are kept and sorted
st.session_state["keep_cells_current"] = sorted(set(st.session_state["keep_cells_current"]))
# A widget is displayed to visually track all user selections (loads empty immediately with other widgets in this col-row)
with col_1_row_1:
multiselect_options = [""]+st.session_state["keep_cells_current"]
st.multiselect(key="keep_cells_final", label="Your selections:", options=multiselect_options, placeholder="",
default=st.session_state.get("keep_cells_current", []))
###################################################################################################
# Step 4 - Extract selected cell lines from the RNA dataset and prepare to preview and download when the user decides to
# Function to save the extracted dataframe to Excel
def save_to_excel(dataframe, filename='RNA_Results.xlsx'):
output = BytesIO()
with pd.ExcelWriter(output, engine='openpyxl') as writer:
dataframe.to_excel(writer, index=True, sheet_name='Search_01')
output.seek(0)
return output
###################
# Button to trigger excel file preparation with the function above
with col_3_row_2:
preview_button = st.button(label="Preview results", type="primary")
if preview_button and st.session_state["keep_cells_final"]:
# Clean previous gene selections and plots
col_2_row_3.empty()
col_1_row_3.empty()
# Find the current cummulative selections in the pre-processed RNA df
st.session_state["extracted_RNA_data"] = st.session_state["RNA_expression"].loc[:, st.session_state["keep_cells_final"]]
st.session_state["extracted_RNA_data"] = st.session_state["extracted_RNA_data"].reset_index(drop=False)
# Prepare the data for preliminary plots
st.session_state["gene_list"] = tuple(st.session_state["extracted_RNA_data"]["Gene"].tolist())
st.session_state["df_to_plot"] = st.session_state["extracted_RNA_data"].copy()
st.session_state["df_to_plot"].insert(0, "Plot?", False)
# Call the function to convert the results df to a xlsx file
st.session_state["excel_data"] = save_to_excel(st.session_state["extracted_RNA_data"])
# Display a button to download the results when a file has been made
if "excel_data" in st.session_state:
with col_4_row_2:
st.download_button(label="Download dataset", data=st.session_state["excel_data"], file_name='RNA_Results.xlsx', type="primary")
###################################################################################################
# Step 5 - Show a preview of the results df and a tool to plot gene expression
# Function to search through the genes for the preliminary plots
def search_genes(searchterm: str) -> List[tuple[str, str]]:
# Assuming st.session_state["gene_list"] is a list of gene names
suggestions = [gene for gene in st.session_state["gene_list"] if searchterm.lower() in gene.lower()]
# Returning a list of tuples where each tuple contains a label and a value
return [(gene, gene) for gene in suggestions]
###################
# Function to make plots based on genes selected by the searchbox and/or data editor
def gene_plotter():
# Get any genes currently selected in the data editor
plot_genes = st.session_state["displayed_df_to_plot"].loc[st.session_state["displayed_df_to_plot"]["Plot?"], "Gene"].tolist()
if plot_genes:
# Plot the genes of interest and allow to swap the grouping type
with col_2_row_3:
st.radio(key="plot_type", label="Plot type", options=["Bar chart", "Heatmap"])
st.toggle(key="group_by", label="Swap group by")
st.markdown('<hr style="margin-top: +10px; margin-bottom: +10px;">', unsafe_allow_html=True)
# Make one of two possible bar charts
if st.session_state["plot_type"] == "Bar chart":
if st.session_state["group_by"]:
fig = px.bar(st.session_state["extracted_RNA_data"][st.session_state["extracted_RNA_data"]['Gene'].isin(plot_genes)],
x='Gene', y=st.session_state["extracted_RNA_data"].columns[1:], barmode='group',
color_discrete_sequence=px.colors.qualitative.Dark2)
# Customize the appearance of the bars
fig.update_traces(marker=dict(line=dict(color='black', width=0.5)), selector=dict(type='bar'))
fig.update_layout(xaxis_title="Gene", yaxis_title="log2(TPM+1)", legend_title="Cell Line", font=dict(size=24))
else:
fig = px.bar(st.session_state["extracted_RNA_data"][st.session_state["extracted_RNA_data"]['Gene'].isin(plot_genes)].set_index('Gene').T,
barmode='group', color_discrete_sequence=px.colors.qualitative.G10)
# Customize the appearance of the bars
fig.update_traces(marker=dict(line=dict(color='black', width=0.5)), selector=dict(type='bar'))
fig.update_layout(xaxis_title="Cell Line", yaxis_title="log2(TPM+1)", legend_title="Gene", font=dict(size=24))
else:
# Make a heatmap
heatmap_data = st.session_state["extracted_RNA_data"][st.session_state["extracted_RNA_data"]['Gene'].isin(plot_genes)].set_index('Gene').T
# Switch rows and columns based on the toggle state
if st.session_state["group_by"]:
heatmap_data = heatmap_data.T
# Customize the appearance of the heatmap
fig = px.imshow(heatmap_data, color_continuous_scale='Cividis')
fig.update_layout(xaxis_title="Cell Line" if st.session_state["group_by"] else "Gene",
yaxis_title="Gene" if st.session_state["group_by"] else "Cell Line",
font=dict(size=24))
# Display plot
fig.update_layout(height=400, width=600)
st.plotly_chart(fig)
else:
# Clear plots if no genes are currently selected
col_2_row_3.empty()
###################
# Proceed with results visualization only when the data has been extracted
if not st.session_state["df_to_plot"].empty:
# Show a searchbox to quickly find genes as the df has thousands of rows (the user can scroll and check boxes too)
with col_1_row_3:
st_searchbox(key="selected_gene", search_function=search_genes, default=None,
label="Type a gene name here or check/uncheck boxes below", clear_on_submit=True,)
st.markdown('<hr style="margin-top: +10px; margin-bottom: +10px;">', unsafe_allow_html=True)
# When the user selects a gene name, automatically check it to display it at the top of the df
if st.session_state["selected_gene"].get("result"):
selected_gene_result = st.session_state["selected_gene"].get("result")
st.session_state["df_to_plot"].loc[st.session_state["df_to_plot"]["Gene"] == selected_gene_result, "Plot?"] = True
# Reset the searchbox so we dont keep the previous result when the df changes
st.session_state["selected_gene"] = {"result": None, "search": "", "options_js": [], "key_react": "A"}
# Show the results df
st.session_state["sorted_df"] = st.session_state["df_to_plot"].sort_values(by=["Plot?", "Gene"], ascending=[False, True])
with col_1_row_3:
st.session_state["displayed_df_to_plot"] = st.data_editor(data=st.session_state["sorted_df"], use_container_width=True, hide_index=True)
# Call the function to plot the selected genes (if any)
gene_plotter()
###################################################################################################