-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathidenticalProSequenceChecker.py
139 lines (116 loc) · 5.6 KB
/
identicalProSequenceChecker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
## author: maria f. palafox
# demonstration of script at identical_proSequence_checker.html
from maplib import *
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("EnsemblFasta", help="This is the Ensembl release-specific fasta filename, example: Homo_sapiens.GRCh38.pep.all.fa")
parser.add_argument("StableIDkey", help="This is the Ensembl-UniProt stable ID cross-reference filename from the same release as the Ensembl fasta file provided, example: Homo_sapiens.GRCh38.94.uniprot.tsv")
parser.add_argument("EnsemblRelease", help="This is the name of specific Ensembl release (ex. v94) that match the fasta and xref file")
parser.add_argument("UniprotFasta", help="This is the UniProtKB fasta filename")
parser.add_argument("UniprotDatabase", help="This is the UniProt database name, options are 'SP' for SWISSPROT or 'SPTREMBL' for SWISSPROT & TREMBL")
args = parser.parse_args()
ensemblFasta = args.EnsemblFasta
xrefile = args.StableIDkey
release = args.EnsemblRelease
ukbfa = args.UniprotFasta
UKBdb = args.UniprotDatabase
outfileName = release + "CheckedProSeq_ENSPtoUKB.csv"
def identicalSequenceCheck(dfref, dfalt):
# Pmap_protein_sequence_comparison.py
# dfref must have 'ID' and 'proSequence' colnames
# dfalt must have 'xref' and 'proSequence' colnames
ref_dic = dict(zip(dfref.ID, dfref.proSequence)) # from uniprot fasta
newcol = []
for index, row in dfalt.iterrows(): # ensembl df
pep = row['proSequence']
ukb_id = row['xref'] # colname
# retrieve uniprot fasta seq using dictionary
pepRef = ref_dic[ukb_id]
str(pepRef)
str(pep)
if pepRef == pep:
newcol.append('True')
if pepRef != pep:
newcol.append('False')
# add identity score list as new column
dfalt.loc[:,'identical2UKBfasta'] = newcol
print(dfalt.shape)
checkColumnValues(dfalt, "identical2UKBfasta")
print("dropping non identical protein seq")
dfalt = dfalt[dfalt['identical2UKBfasta'] == "True"].copy()
print(dfalt.shape)
dfalt.reset_index(inplace=True, drop=True)
return dfalt
def main():
print("*** Importing Ensembl fasta ***")
enspDF = ENSPfasta2DF(ensemblFasta, release)
print("Shape of Ensembl fasta dataframe: ",enspDF.shape)
print("*** Importing UniProt fasta ***")
ukbDF = uniprotkbFasta2csv(ukbfa)
print("Shape of UniProt fasta dataframe: ",ukbDF.shape)
listUKB = list(ukbDF['ID'])
xref = pd.read_table(xrefile)
if UKBdb == "SP":
xref = xref[xref['db_name']== 'Uniprot/SWISSPROT'].copy()
print("Xref SP UniProt IDs kept")
if UKBdb == "SPTREMBL":
print("Xref SP and TREMBL UniProt IDs kept")
## filter ensembl xref file by ukbIDs list
print("*** filtering Ensembl xref file by UniProt fasta stable IDs ***")
xref = addcolumnconditionalDrop(listUKB, xref, 'xref', 'xrefInUKBfasta')
## filter ensembl fasta for ENSP in xref file (filtered by uniprot fasta)
xref = xref[['protein_stable_id', 'xref']].copy()
xref.columns = ['ENSP', 'xref']
listENSP = list(set(xref['ENSP']))
print("*** filtering Ensembl peptide df by stable ENSP IDs from xref "
"file ***")
enspDF = addcolumnconditionalDrop(listENSP, enspDF, 'ENSP', 'enspInXref')
## merge xref file (with ENSP & UKB ID) with Ensembl fasta (with ENSP, ENST, ENSG, proSeq...)
print("*** combining Ensembl xref (ENSP - UKBID cross-ref) with Ensembl "
"fasta DF (contains Ensembl biotype IDs both stable and versioned) ")
mergeENSP = pd.merge(xref, enspDF, on=['ENSP'])
## check ensembl pro sequence identity against canonical ukb seq from fasta
print("*** identify ensembl protein sequences identical to UniProt "
"reference ***")
finaldf = identicalSequenceCheck(ukbDF, mergeENSP)
# change colnames
if release == "v85":
finaldf.columns = ['ENSP', 'UKBID', 'ENSPv',
'Length', 'proSequence',
'ENST', 'ENSTv','ENSG', 'ENSGv', 'Assembly', 'chromosome',
'start', 'stop', 'enspInXref', 'identical2UKBfasta']
# simplify
keepme = ['ENSP', 'UKBID', 'ENSPv', 'Length', 'proSequence',
'ENST', 'ENSTv','ENSG', 'ENSGv', 'Assembly', 'chromosome',
'start', 'stop']
finaldf = finaldf[keepme].copy()
else:
finaldf.columns = ['ENSP', 'UKBID', 'ENSPv', 'Length', 'proSequence',
'ENST', 'ENSTv','ENSG', 'ENSGv', 'Assembly', 'chromosome',
'start', 'stop','HGNCsymbol', 'HGNCdescription', 'enspInXref',
'identical2UKBfasta']
# simplify
keepme = ['ENSP', 'UKBID', 'ENSPv', 'Length', 'proSequence',
'ENST', 'ENSTv','ENSG', 'ENSGv', 'Assembly', 'chromosome',
'start', 'stop','HGNCsymbol', 'HGNCdescription']
finaldf = finaldf[keepme].copy()
# removing rows that dont have std chrom coordinate from Ensembl fasta file
chrlist = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','X','Y']
print("Dropping rows with nonstandard chromosome names")
finaldf= finaldf[finaldf['chromosome'].isin(chrlist)]
print("********** Summary of saved script output **********")
checkColumnValues(finaldf, 'chromosome')
print("Ensembl Protein IDs")
uniqueCount(finaldf, 'ENSP')
print("Ensembl Transcript IDs")
uniqueCount(finaldf, 'ENST')
print("Ensembl Gene IDs")
uniqueCount(finaldf, 'ENSG')
print("UniProt IDs")
uniqueCount(finaldf, 'UKBID')
# save df
finaldf.to_csv(outfileName, index=False)
if __name__ == '__main__':
main()