Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TAD_pathways #19

Open
wants to merge 46 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
f0b585d
Add files via upload
Colin-Cheng Jul 5, 2018
a28cf0d
Delete generate_index_files.py
Colin-Cheng Jul 5, 2018
4007048
Add files via upload
Colin-Cheng Jul 5, 2018
624cea3
Add files via upload
Colin-Cheng Jul 6, 2018
6003eba
add the TAD_boundary argument
Colin-Cheng Jul 6, 2018
ad0eeec
Update build_custom_tad_genelist.py
Colin-Cheng Jul 6, 2018
38caa54
Update build_custom_tad_genelist.py
Colin-Cheng Jul 11, 2018
19c6d5f
update data directory
Colin-Cheng Jul 12, 2018
9dc1f02
add tables directory
Colin-Cheng Jul 12, 2018
74b0476
Delete example_pipeline_bmd.sh
Colin-Cheng Jul 12, 2018
4c2fa01
Delete example_pipeline_t2d.sh
Colin-Cheng Jul 12, 2018
3e536d2
Delete bmd_gene_evidence.csv
Colin-Cheng Jul 12, 2018
c76577b
Delete bmd_gene_evidence_summary.tsv
Colin-Cheng Jul 12, 2018
e3655f9
Delete t2d_gene_evidence.csv
Colin-Cheng Jul 12, 2018
bc5b510
Delete t2d_gene_evidence_summary.tsv
Colin-Cheng Jul 12, 2018
178411e
Delete venn_bmd.tiff
Colin-Cheng Jul 12, 2018
2e9267e
Delete venn_t2d.tiff
Colin-Cheng Jul 12, 2018
dcb14b5
Delete bmd_LD_complete_gestalt.tsv
Colin-Cheng Jul 12, 2018
9ea7284
Delete bmd_LD_gestalt.tsv
Colin-Cheng Jul 12, 2018
aceee43
Delete bmd_LD_pvalues.tsv
Colin-Cheng Jul 12, 2018
c47a888
Delete bmd_complete_gestalt.tsv
Colin-Cheng Jul 12, 2018
fa036a8
Delete bmd_gestalt.tsv
Colin-Cheng Jul 12, 2018
4711a4a
Delete bmd_nearest_gene_complete_gestalt.tsv
Colin-Cheng Jul 12, 2018
2830904
Delete bmd_nearest_gene_gestalt.tsv
Colin-Cheng Jul 12, 2018
5a46c16
Delete bmd_nearest_gene_pvalues.tsv
Colin-Cheng Jul 12, 2018
5eab148
Delete bmd_pvalues.tsv
Colin-Cheng Jul 12, 2018
091deea
Delete t2d_complete_gestalt.tsv
Colin-Cheng Jul 12, 2018
f91fbff
Delete t2d_gestalt.tsv
Colin-Cheng Jul 12, 2018
591b08f
Delete t2d_pvalues.tsv
Colin-Cheng Jul 12, 2018
74f7924
add index directory
Colin-Cheng Jul 13, 2018
5561a9d
Add output files from visualize
Colin-Cheng Jul 13, 2018
e19aad4
add output form generate_index_files.py
Colin-Cheng Jul 13, 2018
c109aa0
Add files via upload
Colin-Cheng Jul 13, 2018
6c165c6
Add output file from visualize
Colin-Cheng Jul 13, 2018
f225acb
add generate_index_file and visualize
Colin-Cheng Jul 13, 2018
8a53ef1
Update build_custom_tad_genelist.py
Colin-Cheng Jul 13, 2018
cc38837
add run_pipeline.sh
Colin-Cheng Jul 17, 2018
101e8c8
Update README.md
Colin-Cheng Jul 17, 2018
74c353a
Update README.md
Colin-Cheng Jul 17, 2018
445da96
Add files via upload
Colin-Cheng Jul 17, 2018
250f701
Update README.md
Colin-Cheng Jul 17, 2018
2a76ac1
Update README.md
Colin-Cheng Jul 17, 2018
285ed4d
Update README.md
Colin-Cheng Jul 17, 2018
8c8fc74
Update README.md
Colin-Cheng Jul 17, 2018
9103347
Update README.md
Colin-Cheng Jul 18, 2018
eca3507
Update build_custom_tad_genelist.py
Colin-Cheng Dec 18, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add files via upload
  • Loading branch information
Colin-Cheng authored Jul 13, 2018
commit c109aa062e8e5c9c8ac8476924f1d62ffc7aa89c
234 changes: 234 additions & 0 deletions scripts/visualize_gc_and_divergence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
"""
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove from PR

2016 Gregory Way
scripts/visualize_gc_and_divergence.py

Description:
Observe GC Content distributions and locations across TADs

Usage:
Is called by 'scripts/visualize.sh' which is run inside of
'scripts/run_pipeline.sh':

python gc_content_distribution.py --TAD-Boundary 'hESC'

Output:
GC Content distribution and histogram across TADs as a .pdf file
"""

import os
import random
import argparse

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO

from tad_util.util import assign_bin, load_tad

plt.figure.max_open_warning = 0
sns.set_style("whitegrid")
sns.set_style("ticks")
sns.set_context("paper", rc={"font.size": 20, "axes.titlesize": 20,
"axes.labelsize": 20, "xtick.labelsize": 12,
"ytick.labelsize": 12})
random.seed(123)

# Load Command Arguments
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--TAD-Boundary', help='boundary cell type.')
parser.add_argument('-f', '--TAD-File', help='path to 3-column-tab-separated TAD domain bed file.')
args = parser.parse_args()

# Define Constants
num_bins = 50
xlab = [''] * num_bins
for x in range(0, 50, 10):
xlab[x] = x
tad_cell = args.TAD_Boundary

# Generate file names depending on input
genome = 'hg19'
base_dir = 'data/'

base_file = '{}_{}'.format(genome, tad_cell)
repeat_index = os.path.join('index',
'REPEATS_index_{}.tsv.bz2'.format(base_file))
gc_fig_file = os.path.join('figures',
'gc_distribution_{}.pdf'.format(base_file))
div_fig_file = os.path.join('figures',
'repeat_divergence_{}.pdf'.format(base_file))
alu_fig_file = os.path.join('figures',
'alu_divergence_{}.pdf'.format(base_file))
tad_loc = args.TAD_File

fasta_loc = os.path.join('data', 'hg19_fasta')


def load_fasta(chrom, fasta_loc):
"""
Retrieve fasta file

Arguments:
:param chrom: the chromosome of interest (format 'chr#')
:param fasta_loc: the location that stores the hg19 fasta files

Output:
fasta file for the given chromosome
"""

chrom_fa = os.path.join(fasta_loc, 'chr{}.fa'.format(chrom))
record = SeqIO.read(open(chrom_fa), 'fasta')

nucleotides = str(record.seq)
# FASTA file has upper and lowercase letters
# lower case = repetative elements
nucleotides = nucleotides.upper()

return nucleotides


def split_TAD_bins(tadlength, num_bins):
"""
Return a list of coordinates to partition the TAD

Arguments:
:param tadlength: how long the TAD is
:param num_bins: how many bins to split

Output:
a list of tuples with the locations for starting and ending bins
"""

avgbin = tadlength / num_bins
remainder = tadlength % num_bins
if remainder > 0:
randadd = random.sample(range(0, num_bins), remainder)
else:
randadd = []

return_list = []
current_idx = 0
for binID in range(0, num_bins):
next_idx = current_idx + avgbin
if binID in randadd:
return_list.append((current_idx + 1, next_idx + 1))
current_idx = next_idx + 1
else:
return_list.append((current_idx + 1, next_idx))
current_idx = next_idx
return return_list


def determine_gc_content(seq, start, end):
"""
Determine the gc content for a given sequence given bin coordinates

Arguments:
:param seq: a nucleotide sequence
:param start: where to subset the sequence
:param end: where to subset the sequence

Output:
A count of GC content within the specific coordinates
"""

import collections

start = int(start)
end = int(end)

subset_seq = seq[start:end]

c = collections.Counter(subset_seq)

# Known length will be zero if entire sequence is not known or `N`
known_length = sum(c[base] for base in 'ACTG')
GC = (c['G'] + c['C']) / known_length if known_length else 0.5

return GC


def get_gc_content(tad, seq, bins):
"""
Determine the gc content of all TADs across TAD bins

Arguments:
:param tad: a row in a TAD boundary DataFrame
:param seq: the SeqIO.seq object for the chromosome fasta
:param bins: int, number of bins to distribute gc content of TAD sequence

Output:
A pandas series of gc content of all TADs across bins
"""

tad_start = int(tad['start'])
tad_end = int(tad['end'])

tad_length = tad_end - tad_start

# Get the TAD bins
tad_bins = split_TAD_bins(tad_length, bins)

tad_sequence = seq[tad_start:tad_end]

# Now, loop over the TAD bins and extract GC Content
tad_gc = []

for coord in tad_bins:
start, end = coord

gc = determine_gc_content(tad_sequence, start, end)
tad_gc.append(gc)

return pd.Series(tad_gc)

# Load Data
tad_df = load_tad(tad_loc)
repeat_df = pd.read_table(repeat_index, index_col=0)
repeat_df = repeat_df.ix[~pd.isnull(repeat_df['TAD_id'])]
bin_r = repeat_df.apply(lambda x: assign_bin(x, bins=num_bins, ID='repeat'),
axis=1)
repeat_df = repeat_df.assign(tad_bin=bin_r)
repeat_df = repeat_df[repeat_df['tad_bin'] != -1]
alu_df = repeat_df[repeat_df['repeat'] == 'SINE/Alu']

# Plot divergence
p = sns.boxplot(x=repeat_df['tad_bin'], y=repeat_df['div'],
color=sns.xkcd_rgb['medium green'])
sns.despine()
p.set(xticklabels=xlab)
p.set(ylabel='Repeat Divergence', xlabel='TAD Bins')
p.set_title('')
plt.tight_layout()
plt.savefig(div_fig_file)
plt.close()

# ALU divergence
p = sns.boxplot(x=alu_df['tad_bin'], y=alu_df['div'],
color=sns.xkcd_rgb['medium green'])
sns.despine()
p.set(xticklabels=xlab)
p.set(ylabel='ALU Repeat Divergence', xlabel='TAD Bins')
p.set_title('')
plt.tight_layout()
plt.savefig(alu_fig_file)
plt.close()

# Plot GC content
gc_content_df = pd.DataFrame()
for chrom in tad_df['chromosome'].unique():
tad_sub = tad_df[tad_df['chromosome'] == chrom]
fasta = load_fasta(str(chrom), fasta_loc)
gc_content = tad_sub.apply(lambda x: get_gc_content(x, seq=fasta,
bins=num_bins), axis=1)
gc_content_df = gc_content_df.append(gc_content, ignore_index=True)

p = sns.boxplot(data=gc_content_df, color=sns.xkcd_rgb['medium green'])
sns.despine()
p.set(xticklabels=xlab)
p.set(ylabel='GC Content', xlabel='TAD Bins')
p.set_title('')
plt.tight_layout()
plt.savefig(gc_fig_file)
plt.close()
Loading