Skip to content

Commit

Permalink
determine_ibd script and helper updates
Browse files Browse the repository at this point in the history
  • Loading branch information
washedgram committed Sep 12, 2024
1 parent 58ab9ac commit edfae00
Show file tree
Hide file tree
Showing 36 changed files with 14,895 additions and 157,855 deletions.
7 changes: 4 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
lib/hapmap3/*
lib/KDE_data/*
lib/1KG/*
lib/perl_modules/File
lib/perl_modules/Getopt
testing/*
__pycache__/
output/*
output/*
alyssa/*
example_data/
compadre_testing.ipynb
15 changes: 8 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,16 @@ COPY . .
# this might not be necessary anymore -- symbolic link to 'old' plink that primus might be expecting
RUN ln -s /bin/plink1.9 /bin/plink

# Download reference data
# RUN wget https://compadre.dev/api/data/primus_reference_data.tgz -O lib/reference_data/primus_reference_data.tgz && \
# tar -xzvf lib/reference_data/primus_reference_data.tgz -C lib/reference_data && \
# rm lib/reference_data/primus_reference_data.tgz
# ## ^ This assumes that the unzipped folders in the .tgz download are correctly named, need to double check this

# Install python packages
RUN pip3 install --no-cache-dir -r requirements.txt

#######
## IN PROGRESS: install reference data from github releases URL
RUN wget https://github.com/belowlab/compadre/releases/download/v1.0.0/reference-data.dat

## after downloading, move it into the lib folder accordingly
#######

# Install the KernSmooth R package
RUN Rscript -e "install.packages('KernSmooth', repos='http://cran.rstudio.com/')"

Expand All @@ -37,4 +38,4 @@ ENV PERL5LIB=$PERL_PATH:$PERL_PATH/lib/perl5:/usr/src/lib/perl_modules/:/usr/src

# Set primus executable entrypoint
WORKDIR /usr/src/bin
ENTRYPOINT ["perl", "./run_PRIMUS.pl"]
ENTRYPOINT ["perl", "./run_PRIMUS.pl"]
5 changes: 3 additions & 2 deletions bin/primus_kickoff7.pl
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ sub print_files_and_settings
my $libpath = $lib_dir;
$libpath =~ s{/$}{};
my $parent_dir = File::Spec->catdir(dirname($libpath));
my $helper_path = File::Spec->catfile($parent_dir, 'compadre_helper_new.py');
my $helper_path = File::Spec->catfile($parent_dir, 'helper.py');

# instantiate the new compadre helper using the filepath from $match_data
my ($reader, $writer);
Expand All @@ -456,7 +456,8 @@ sub print_files_and_settings
# Wait for the server to be ready
while (my $line = <$reader>) {
if ($line =~ /COMPADRE helper socket is ready/) {
print "\nCOMPADRE helper socket is ready\n";
chomp $line;
print "\n$line\n";
last;
}
}
Expand Down
36 changes: 23 additions & 13 deletions ersa.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,8 @@ def ibd2_sib_ll(segment_list):
return r_ll

def add_segment(ind_sharing,ind_id,cm,controls="no"):

# ind_sharing is the same as ibd2_dict

if abs(cm)>=options.min_cm:
if controls=="no" or cm<=options.max_cm:
Expand Down Expand Up @@ -364,7 +366,7 @@ def process_segment(chromosome,ascertained_dict,sharing_dict,ibd2_dict,ind_id,cm
add_segment(sharing_dict,ind_id,new_cm,controls)
masked_sum[ind_id]+=cm-new_cm

else:
else: # IBD2=="yes"
[new_begin,new_end]=get_masked_coordinates(chromosome,begin_position,end_position,masked_segments_dict)
new_cm=get_cm(new_begin,new_end,begin_position,end_position,cm,recombination_rates)
add_segment(ibd2_dict,ind_id,new_cm,controls)
Expand Down Expand Up @@ -438,8 +440,6 @@ def get_confidence_levels(models,max_model_id,max_model_ll,confidence_statistic,
row_df = pd.DataFrame([df_info])
model_df = pd.concat([model_df, row_df], ignore_index=True)



if model.ml+confidence_statistic>=max_model_ll:
if model.ancestors==0:
n_0p.append(model.meioses)
Expand Down Expand Up @@ -474,7 +474,7 @@ def get_confidence_levels(models,max_model_id,max_model_ll,confidence_statistic,
def add_segments(beagle_marker_dict,rec_dict,chromosome_positions,sharing_dict,seg_input,recombination_rates,pairs,masked_segments_dict,ascertained_dict={},ibd2_dict={},control_ind=None,control_segments=None,masked_sum={}):

ind_dict={}
IBD2="no"


'''handling recombination file stuff'''
if not options.recombination_files is None:
Expand Down Expand Up @@ -530,14 +530,8 @@ def add_segments(beagle_marker_dict,rec_dict,chromosome_positions,sharing_dict,s
if pairs["cases"]==2:
all_cases="yes"

'''
update 6.30.24
check if 'filename' is a dict or not
if it is, process differently
'''

if type(seg_input) == str:
if type(seg_input) == str: # Previously standard behavior (passing in a file location for the .match file)

start_time = datetime.now()

Expand Down Expand Up @@ -629,20 +623,28 @@ def add_segments(beagle_marker_dict,rec_dict,chromosome_positions,sharing_dict,s
if options.verbose:
print(f"Time spent (FILE): {total_seconds} seconds\n")

else: # seg_input is a dict
else: # Dict object processing, usually for pairwise calculation

'''
The segment dictionary now has another variable in the k:v tuple ~ 'NA' if no IBD data, otherwise 1 or 2
(chrom, start, end, cmlen, ibd)
'''

if options.verbose:
print ('Handling dict of segment information\n')

# start converting from line 548
start_time = datetime.now()

for ind_id, segments in seg_input.items():
for ind_id, segments in seg_input.items(): # Keys in the dict

ids = ind_id.split(':')
id1, id2 = ids[0], ids[1]

for segment_tuple in segments: # iterate through all the segments stored for this pair

### MOVED IBD2 variable into the loop because that might have been messing it up?

ind_dict[id1] = 1
ind_dict[id2] = 1

Expand All @@ -656,6 +658,14 @@ def add_segments(beagle_marker_dict,rec_dict,chromosome_positions,sharing_dict,s

begin_position, end_position, cm = segment_tuple[1], segment_tuple[2], segment_tuple[3]

# New 9/3/24 ~ check IBD data in the tuple (NA, 1, or 2)
ibd_status = segment_tuple[4]
if ibd_status == 2:
#print ('yes')
IBD2 = 'yes'
else:
IBD2 = "no"

process_segment(chromosome, ascertained_dict, sharing_dict, ibd2_dict, ind_id, cm, controls, begin_position, end_position, recombination_rates, IBD2, control_segments, masked_segments_dict, masked_sum)

end_time = datetime.now()
Expand Down
11 changes: 0 additions & 11 deletions example_data/real/1K_genomes_MEX_family.features

This file was deleted.

46 changes: 0 additions & 46 deletions example_data/real/1K_genomes_MEX_family.genome

This file was deleted.

Binary file removed example_data/real/MEX_pop.bed
Binary file not shown.
Loading

0 comments on commit edfae00

Please sign in to comment.