-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprepCh20Data.py
59 lines (48 loc) · 2.06 KB
/
prepCh20Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import pandas as p
# Read file obtained via custom download from ensembl biomart
# Custom download:
# Filters: Chromosome/scaffold: 20
# With HGNC Symbol ID(s): Only
# Attrib.: Gene stable ID
# Gene start (bp)
# Gene end (bp)
# Strand
# GOSlim GOA Accession(s)
# GOSlim GOA Description
# HGNC symbol
def _pickTopGOA(table, GOAfreq):
    """Map every multiply-annotated symbol to its least frequent GOA id.

    Args:
        table: DataFrame with at least the columns "sym" and "GOAid".
        GOAfreq: Series of occurrence counts indexed by GOA id.

    Returns:
        dict of symbol -> GOA id, containing only symbols that have more
        than one non-missing GOA id in ``table``. Ties are resolved in
        favour of the id that appears first among the symbol's rows.
    """
    topGOA = {}
    for symbol, ids in table.groupby("sym", sort=False)["GOAid"]:
        # Missing ids are not in the frequency table; looking them up
        # would raise a KeyError, so leave them out of the comparison.
        ids = ids.dropna()
        if len(ids) > 1:
            topGOA[symbol] = GOAfreq.loc[ids.tolist()].idxmin()
    return topGOA


def prepData(inFile="mart_export.txt", outFile="chr20_data.tsv", chrom=20):
    """Reduce a BioMart export to a per-symbol table and write it as TSV.

    The export is expected to contain the columns "Gene stable ID",
    "Gene start (bp)", "Gene end (bp)", "Strand",
    "GOSlim GOA Accession(s)", "GOSlim GOA Description" and "HGNC symbol".
    For each symbol with several GOA ids, only the rows carrying the least
    frequent id (taken as the most specific one) are kept.

    Args:
        inFile: Path of the tab-separated BioMart export to read.
        outFile: Path of the tab-separated file to write.
        chrom: Value stored in the "chr" column of every row.

    Returns:
        None. The result is written to ``outFile`` with "sym" as the first
        column, sorted by symbol.
    """
    # load the biomart data
    rawData = p.read_table(inFile, header=0)
    # don't need the gene ID column but we do want a chromosome column,
    # so the IDs are overwritten and the column is renamed below
    rawData["Gene stable ID"] = chrom
    rawData = rawData.rename(columns={
        "Gene stable ID": "chr",
        "Gene start (bp)": "start",
        "Gene end (bp)": "end",
        "Strand": "strand",
        "GOSlim GOA Accession(s)": "GOAid",
        "GOSlim GOA Description": "GOAdescr",
        "HGNC symbol": "sym"})
    # make GOA frequency table (counted on the rows as exported, i.e.
    # before duplicates are removed)
    GOAfreq = rawData["GOAid"].value_counts()
    # remove duplicate rows while sym is still a regular column:
    # drop_duplicates ignores the index, so de-duplicating after sym became
    # the index would merge different symbols that share all other values
    rawData = rawData.drop_duplicates()
    # choose the most specific (least frequent) GOA per symbol and keep
    # every row that either belongs to a symbol without a choice to make
    # or carries the chosen GOA
    chosen = rawData["sym"].map(_pickTopGOA(rawData, GOAfreq))
    rawData = rawData[chosen.isna() | (rawData["GOAid"] == chosen)]
    # set sym to be the index and sort output by HUGO symbol; a stable sort
    # keeps rows that share a symbol in their original relative order
    rawData = rawData.set_index("sym").sort_index(kind="mergesort")
    # write to tsv
    rawData.to_csv(outFile, sep="\t")
# Script entry point: build the table only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    prepData()