-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript_add_table_to_database.py
56 lines (42 loc) · 2.04 KB
/
script_add_table_to_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#####################################
###### ABOUT THIS SCRIPT #
#####################################
# This script reads a txt/csv file and imports it as a table in a database
# It also merges two tables on a common column, and writes the resulting merge as a new table in the database
#####################################
###### INSTRUCTIONS FOR USE: #
#####################################
#In the shell go to this directory and type 'python script_add_table_to_database.py' to run the script.
#####################################
# Import necessary libraries
import numpy
import pandas
from pandas.io import sql
import sqlite3 as lite
# Open connection to the geneSequenceResults.db
con = lite.connect('geneSequenceResults.db');
cur = con.cursor()
# Read in the file as a dataframe
patients_table = pandas.read_csv('nationwidechildrens.org_clinical_patient_chol.txt', sep='\t')
# This particular file needs to be clean (the first two rows are header descriptions, not actual data
if patients_table.columns[0] == patients_table.iloc[0][patients_table.columns[0]]:
patients_table = patients_table.shift(-2)
# Write the dataframe to the database as a table
cur.execute('DROP TABLE IF EXISTS patients')
sql.to_sql(patients_table, name='patients', con=con)
# Read in database table as a dataframe
gene_pairs = pandas.read_sql_query('select * from gene_pairs', con)
# Merge the two dataframes on a common column
patients_table.rename(columns={'bcr_patient_barcode':'sample'}, inplace=True)
merged = pandas.merge(gene_pairs, patients_table, on=['sample'], how='inner')
gene_pairs_patient = merged[ numpy.concatenate([gene_pairs.columns, ['tumor_grade']]) ]
# Write the resulting dataframe to the database as a table
cur.execute('DROP TABLE IF EXISTS gene_pairs_patient')
sql.to_sql(gene_pairs_patient, name='gene_pairs_patient', con=con)
# Commit to the changes and close connection to geneSequenceResults.db
con.commit()
con.close()
print("...successfully completed !")
#####################################
# END OF FILE
#####################################