-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract_species_specific_orthogroups.py
122 lines (99 loc) · 4.04 KB
/
extract_species_specific_orthogroups.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
import os
import sys
import re
import csv
import argparse
import glob
from pathlib import Path
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--prefix', nargs='+', help='prefix describing the species to extract')
parser.add_argument('-i', '--input', help='path to Orthogroups.tsv file (default = "Orthogroups.tsv" in the current directory)', default = 'Orthogroups.tsv')
parser.add_argument('-o', '--output', help='path to location for output files (default = current directory)')
parser.add_argument('-f', '--fasta', help = 'path to folder containing fasta files containing the protein sequences used to run Orthofinder (defaul = files are in current directory)')
args = parser.parse_args()
if args.input:
orthogroups_file = Path(args.input)
if args.output:
output_path = Path(os.path.abspath(args.output))
else:
output_path = Path(os.path.join(os.getcwd(),'output'))
if args.fasta:
fasta_files = Path(args.fasta)
else:
fasta_files = Path(os.getcwd())
fastas = []
for file in fasta_files.glob('*.fasta'):
fastas.append(file)
if args.prefix:
prefix = args.prefix
else:
print('ERROR - please provide at least one protein name prefix')
sys.exit()
orthogroups = {}
species_specific_orthogroups = {}
proteins = {}
# Create output folder
try:
os.makedirs(output_path)
except Exception:
pass
# Overwrite files in output folder if any exist
for file in os.listdir(output_path):
file_path = os.path.join(output_path, file)
try:
if os.path.isfile(file_path):
os.unlink(file_path)
except Exception as e:
print(e)
# Read Orthogroups.tsv file
if not os.path.isfile(orthogroups_file):
print('ERROR - missing:', orthogroups_file)
sys.exit()
with open(orthogroups_file) as file:
print('=======================')
print('Reading Orthogroups.tsv')
input = csv.reader(file, delimiter='\t')
next(input, None) #skip header line
for line in input:
id = line[0]
orthogroups[id] = line[1:]
# Parse fasta files
print('Parsing fasta files')
print('=======================')
for fasta in fastas:
with open(fasta) as file:
input = file.read().splitlines()
for line in input:
if not line.strip(): continue
if line[0] == '>':
name = line[1:]
proteins[name] = ''
else:
proteins[name] += line
#for protein in proteins:
#
# Check if the orthogroups contain proteins from all the provided prefixes.
# If an orthogroup contains only members of a single prefix then add that
# protein name to the 'species_specific_orthogroups' dictionary
for element in orthogroups: #element = orthogroup number
counter = len(prefix)
for p in prefix: # p = one of the prefixes
for x in orthogroups[element]: # x = current element of orthogroup
if p in x: # check if the element contains the prefix
counter -= 1 # if it does, decrease the counter by 1
if counter == 1: # if counter == 1 after the loop, the orthogroup contains only species specific elements
for x in orthogroups[element]:
if x != '': # if the orthogroup does not contain one of the prefixes, it contains empty list elements which we want to ignore
temp = (x.split(", ")) # if there are several members from the same species, they are separated by a comma and a space
for t in temp:
if element in species_specific_orthogroups:
species_specific_orthogroups[element].append(t)
else:
species_specific_orthogroups[element] = [t]
print('There are',len(species_specific_orthogroups),'species specific orthogroups')
print('Extracting species specific orthogroups to',output_path)
for x in prefix:
for orthogroupID in species_specific_orthogroups:
for protein in species_specific_orthogroups[orthogroupID]:
print('>'+protein, '\n'+proteins[protein], file=open(os.path.join(output_path, orthogroupID+'.fasta'),'a'))