-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathIDEA.py
142 lines (120 loc) · 5.03 KB
/
IDEA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from __future__ import print_function
import argparse
import pandas as pd
import sys
import random
import requests
import yaml
import urllib.request
import urllib.error
import urllib.parse
# --- Command-line interface: single positional argument, the YAML config path.
parser = argparse.ArgumentParser()
parser.add_argument('filename')  # path to the YAML configuration file
args = parser.parse_args()

# Load run configuration (input path, thresholds, species) from the YAML file.
with open(args.filename) as file:
    config = yaml.full_load(file)

# Path to the DESeq2 results CSV to be filtered.
input_path = config['DESeq_input']['path']
# TODO: fix static thresholding, allow for variable
# Per-column filter thresholds read from the config.
baseMean = config['baseMean']
log2FoldChange = config['log2FoldChange']
lfcSE = config['lfcSE']
pvalue = config['pvalue']
padj = config['padj']
species_id = config['species']  # NCBI species identifier passed to the STRING API
cs_threshold = config['cs_threshold']  # NOTE(review): read but not used in this file — confirm downstream use
df = pd.read_csv(input_path)
# use threshold value to cut down CSV
# only columns defined in config.yaml file
# NOTE(review): baseMean is filtered with '>' but all other columns with '<' —
# confirm this asymmetry is intended (e.g. log2FoldChange is not an absolute-value cut).
df_threshold = df.loc[(df['baseMean'] > baseMean)
                      & (df['log2FoldChange'] < log2FoldChange)
                      & (df['lfcSE'] < lfcSE)
                      & (df['pvalue'] < pvalue)
                      & (df['padj'] < padj)]
# Gene identifiers that pass every threshold; consumed as a global by the API helpers below.
my_genes = df_threshold['genes']
# TODO: Use Dataframe.append to add to gene list
# TODO: Use Dataframe.append to add to gene list
def mapId():
    """Map the gene names in the global ``my_genes`` to STRING identifiers.

    POSTs the gene list to the STRING ``get_string_ids`` endpoint, prints each
    input/STRING identifier pair, and returns the raw TSV response lines.

    Returns:
        list[str]: one raw tab-separated response line per mapped gene
        (unsorted; identifiers still need to be extracted downstream).
    """
    mapped_Id = []
    string_api_url = "https://string-db.org/api"
    output_format = "tsv"
    method = "get_string_ids"
    # configure parameters
    params = {
        "identifiers": "\r".join(my_genes),
        "species": species_id,
        "limit": 1,
        "echo_query": 1,
        "caller_identity": "mdibl.org"
    }
    request_url = "/".join([string_api_url, output_format, method])
    results = requests.post(request_url, data=params)
    for line in results.text.strip().split("\n"):
        fields = line.split("\t")
        input_identifier, string_identifier = fields[0], fields[2]
        print("Input:", input_identifier, "STRING:",
              string_identifier, sep="\t")
        mapped_Id.append(line)
    # BUG FIX: previously returned only the last loop variable `line` (a single
    # string); return the accumulated list as the surrounding comments intend.
    return mapped_Id
# returning unsorted list, need to extract only strings that start with ENSDARP*
# NOTE(review): return value is discarded here — presumably only the printed
# mapping is wanted at this stage; confirm before wiring into a main().
mapId()
# function to pass in the appropriately converted string protein ID?
# strip out species ID
# format to pass in for the networkInteractions function
# def prepareId():
# for each protein in a given list, print protein-protein interactions
# with medium medium or higher confidence exp score
# retrieves only the interactions between the set of input proteins
# and between their closest interaction neighborhood
def networkInteraction():
    """Print STRING network interactions among the global ``my_genes``.

    Queries the STRING ``network`` endpoint and prints, one per line, every
    interaction whose experimental-evidence score (column 11) is nonzero,
    tagged with that score. Output goes to stdout; nothing is returned.
    """
    api_base = "https://string-db.org/api"
    request_url = "/".join([api_base, "tsv-no-header", "network"])
    payload = {
        "identifiers": "%0d".join(my_genes),  # your protein
        "species": species_id,  # species NCBI identifier
        "caller_identity": "mdibl.org"  # your app name
    }
    # read and parse results
    response = requests.post(request_url, data=payload)
    for row in response.text.strip().split("\n"):
        fields = row.strip().split("\t")
        experimental_score = float(fields[10])
        if experimental_score != 0:
            # first five columns of the record, then the annotated score
            annotated = fields[0:5] + [
                "experimentally confirmed (prob. %.3f)" % experimental_score]
            print("\t".join(annotated))
# provides the interactions between your set of proteins and all the other STRING proteins
# for each protein in a given list, print name of best interaction partner(s)
# dictated by "limit"
def interactionPartners():
    """Print the single best STRING interaction partner for each gene.

    Queries the STRING ``interaction_partners`` endpoint (``limit=1``) for the
    global ``my_genes`` and prints one tab-separated line per gene:
    query ENSP, query name, partner ENSP, partner name, combined score.
    Output goes to stdout; nothing is returned.
    """
    api_base = "https://string-db.org/api"
    request_url = "/".join([api_base, "tsv-no-header", "interaction_partners"])
    payload = {
        "identifiers": "%0d".join(my_genes),  # your protein
        "species": species_id,  # species NCBI identifier
        "limit": 1,
        "caller_identity": "mdibl.org"  # your app name
    }
    response = requests.post(request_url, data=payload)
    for row in response.text.strip().split("\n"):
        fields = row.strip().split("\t")
        # columns: 0=query ENSP, 1=partner ENSP, 2=query name,
        #          3=partner name, 5=combined score
        print("\t".join([fields[0], fields[2],
                         fields[1], fields[3], fields[5]]))
# TODO flesh out main method
#if __name__ == "__main__":
#mapId()
#networkInteraction()
#interactionPartners()
#print("after __name__ guard")
# open and read based on secondary threshold
# slice based upon names that pass primary threshold
# split passing and failing into separate tables
# remove duplicates?