_batch_qiime

#!/bin/sh

export SEQ_PRISMS_BIN=/dataset/gseq_processing/active/bin/batch_qiime_prism/seq_prisms 
export BATCH_QIIME_PRISM_BIN=/dataset/gseq_processing/active/bin/batch_qiime_prism


batch_qiime_version=$1

function read_answer_with_default() {
   read answer
   if [ -z "$answer" ]; then
      answer=$@
   fi
}


function get_run_opts() {

   DRY_RUN=no
   DEBUG=no
   HPC_TYPE=slurm
   FILES=""
   OUT_ROOT=""
   BLAST_DATABASE=""
   TAX_DATA=""
   SIMILARITY=""
   SEQLENGTH_MIN=""
   FILE_TYPE=""

   PROCESSING_ROOT=/dataset/gseq_processing/scratch/batch_qiime
   PROCESSING_ROOT=/dataset/gseq_processing/scratch/batch_qiime


   echo "*** batch_qiime analysis version $batch_qiime_version ***
* note that you can paste into your terminal window by clicking your right mouse button
* you can select text in the terminal window using your left mouse button
* at any stage you can press CTRL-C to exit the dialogs
* if you would prefer to run a single batch command, use batch_qiime_prism.sh (-h for help) - e.g. 
  this allows you to run the analysis on any collection of input files
"

   ####### get and check DATASET
   while [ 1 ] ; do
      echo "
please give the name of the dataset you would like to process (e.g. PJ_MGS00100 ), or a folder containing your sequence data 
"
      read DATASET
      if [ ! -z "$DATASET" ]; then 
         if [ -d $DATASET ]; then
            break
         fi
         if [ ! -d /dataset/${DATASET}/scratch  ]; then 
            echo "sorry can't find (e.g.) /dataset/${DATASET}/scratch (and $DATASET is not an accessible folder)"
         else
            break
         fi
      fi

   done

   if [ -d ${DATASET} ]; then
      echo "will look for data in ${DATASET}" 
   else 
      echo "will look for data in /dataset/${DATASET}/scratch"
   fi


   ####### get and check whether to run locally or on the cluster
   echo "

should this run be queued on the compute cluster ? (y/n, default=y. If n, will be run locally)
"
   read_answer_with_default y
   if [ "$answer" != "n" ]; then 
      HPC_TYPE=slurm
   else
      HPC_TYPE=local
   fi
   

   # set up folder
   if [ -d $DATASET ]; then
      # try removing the first component of the path and then appending that to processing_root
      OUTPUT_ROOT=`echo $DATASET | cut -d'/' -f3-`
      OUTPUT_ROOT=$PROCESSING_ROOT/$OUTPUT_ROOT
   else
      OUTPUT_ROOT=$PROCESSING_ROOT/$DATASET
   fi

   while [ 1 ]; do
      echo "please specify output base folder (or just press Enter/Return to use default , $OUTPUT_ROOT )"
      read_answer_with_default $OUTPUT_ROOT 
      NEW_ROOT=$answer
      if [ -d $NEW_ROOT ]; then
         echo "warning - $NEW_ROOT already exists, use anyway ? (y/n, default=y)"
         read_answer_with_default y
         if [[ ( $answer == "y" ) || ( -z $answer ) ]]; then
            OUTPUT_ROOT=$NEW_ROOT
            break
         fi
      else
         mkdir -p $NEW_ROOT
         if [ -d $NEW_ROOT ]; then
            OUTPUT_ROOT=$NEW_ROOT
            break
         fi
      fi
   done

   echo "will use output root folder $OUTPUT_ROOT 

   "

   
   ####### get and check the analysis type 
   while [ 1 ] ; do
      ANALYSIS=qiime
      echo "
please give which analysis you want (all, clean, qiime, kmer_analysis) (or just press enter to run qiime)  
(notes :
* entering qiime includes length filtering, clustering, taxonomy assignment and summarising) 
* the clean step is currently run seperately , and is not done if you select all or qiime 
)

"
      read_answer_with_default qiime
      ANALYSIS=$answer

      if [[ ( "$ANALYSIS" != "all" ) && ( "$ANALYSIS" != "clean" ) && ( "$ANALYSIS" != "qiime" ) && ( "$ANALYSIS" != "kmer_analysis" ) ]]; then
         echo "analysis must be one of all, clean, qiime, kmer_analysis"
      else
         break
      fi
   done

   if [[ ( "$ANALYSIS" == "all" ) || ( "$ANALYSIS" == "qiime" ) ]]; then
      ls $OUTPUT_ROOT/qiime_analysis/* > /dev/null 2>&1
      if [ $? == 0 ] ; then
         echo "found existing results under $OUTPUT_ROOT/qiime_analysis/* - are you sure you want to continue (e.g. complete an interrupted run) ? (y/n)"
         read_answer_with_default n
         if [[ ( $answer != "y" )  && ( $answer != "Y" ) ]]; then
            echo "ok quitting - please either use a different output folder or clean up $OUTPUT_ROOT/qiime_analysis"
            exit 1
         fi
      fi
   fi
   if [[ ( "$ANALYSIS" == "all" ) || ( "$ANALYSIS" == "kmer_analysis" ) ]]; then
      ls $OUTPUT_ROOT/kmer_analysis/* > /dev/null 2>&1
      if [ $? == 0 ] ; then
         echo "found existing results under $OUTPUT_ROOT/kmer_analysis - are you sure you want to continue (e.g. complete an interrupted run) ? (y/n)"
         read_answer_with_default n
         if [[ ( $answer != "y" )  && ( $answer != "Y" ) ]]; then
            echo "ok quitting - please use a different output folder"
            exit 1
         fi
      fi
   fi

   echo "will use analysis=$ANALYSIS
   "


   ####### get and check the blast data to use 
   if [[ ( $ANALYSIS == "all" ) || ( $ANALYSIS == "qiime" ) ]]; then
      while [ 1 ] ; do
         echo "
please give the full path to the blast database (or just press enter to use default, /dataset/datacache/archive/metagenomics/build/combined_092016_silva.fasta) 
"
         read_answer_with_default /dataset/datacache/archive/metagenomics/build/combined_092016_silva.fasta
         BLAST_DATABASE=$answer

         if [ -f ${BLAST_DATABASE}.nin ]; then
            break
         else
            echo "could not find blast index file ${BLAST_DATABASE}.nin"
         fi
      done

      echo "will use blast database $BLAST_DATABASE"
   fi


   ####### get and check the tax data to use 
   if [[ ( $ANALYSIS == "all" ) || ( $ANALYSIS == "qiime" ) ]]; then
      while [ 1 ] ; do
         echo "
please give the full path to the taxonomy data (or just press enter to use default, /dataset/datacache/archive/metagenomics/build/combined_092016_silva.taxonomy) 
"
         read_answer_with_default /dataset/datacache/archive/metagenomics/build/combined_092016_silva.taxonomy 
         TAX_DATA=$answer

         if [ -f $TAX_DATA ]; then
            break
         else
            echo "could not find tax data $TAX_DATA"
         fi
      done

      echo "will use tax data $TAX_DATA"
   fi


   ####### get and check the similarity to use  
   if [[ ( $ANALYSIS == "all" ) || ( $ANALYSIS == "qiime" ) ]]; then
      while [ 1 ] ; do
         echo "
please specify the similarity to use for clustering (or just press enter to use default, 0.99) 
"
         read_answer_with_default 0.99
         SIMILARITY=$answer

         python -c "print float('$SIMILARITY')" >/dev/null 2>&1
         if [ $? != 0 ]; then
            echo "looks like similarity requested ( $SIMILARITY ) is not a number"
            exit 1
         else
            break
         fi
      done

      echo "will use similarity $SIMILARITY"
   fi


   ####### get and check the minimum length to use  
   if [[ ( $ANALYSIS == "all" ) || ( $ANALYSIS == "qiime" ) ]]; then
      while [ 1 ] ; do
         SEQLENGTH_MIN=200 
         echo "
please specify the minimum seq length (or just press enter to use default, 200) 
"
         read_answer_with_default 200
         SEQLENGTH_MIN=$answer

         python -c "print float('$SEQLENGTH_MIN')" >/dev/null 2>&1
         if [ $? != 0 ]; then
            echo "looks like minimum seqlength requested ( $SEQLENGTH_MIN ) is not a number"
            exit 1
         else
            break
         fi
      done

      echo "will use minimum seqlength $SEQLENGTH_MIN"
   fi
} # get_run_opts


function run_qiime_analysis() {
   mkdir -p $OUTPUT_ROOT/qiime_analysis

   if [ ! -d $OUTPUT_ROOT/qiime_analysis ]; then
      echo "unable to create output folder $OUTPUT_ROOT/qiime_analysis - quitting"
      exit 1
   fi

   if [ -d $DATASET ]; then
      FILE_PATH=$DATASET
   else
      FILE_PATH=/dataset/$DATASET/scratch     
   fi

   while [ 1 ]; do
      if [ -f $OUTPUT_ROOT/input_file_list.txt ]; then 
         break
      fi

      echo "
finding files to process..."
      find $FILE_PATH/ -name "*_R1_*fastq*.gz" -print > $OUTPUT_ROOT/input_file_list.txt 2>/dev/null
      return_code=$?
      if [ $return_code == 0 ]; then
         num_files=`cat $OUTPUT_ROOT/input_file_list.txt | wc -l`
         if [ $num_files == 0 ]; then
            return_code=1
         fi
      fi 
      if [ $return_code != 0 ]; then
         # try just listing all .gz files , attempting to filter just R1
         ls $FILE_PATH/*.gz | egrep "(_R1_|_1\.f).*.gz$" > $OUTPUT_ROOT/input_file_list.txt 2>/dev/null
         if [ $? != 0 ]; then
            echo "unable to find any files like *.gz under $FILE_PATH/"
            while [ 1 ]; do
               echo "please enter path to to files (or CTRL-C and edit $OUTPUT_ROOT/input_file_list.txt so it contains the names of the files to process)"
               read $FILE_PATH
               echo "
finding files to process..."
               find $FILE_PATH/ -name "*_R1_*fastq*.gz" -print > $OUTPUT_ROOT/input_file_list.txt  2>/dev/null
               return_code=$?
               if [ $return_code == 0 ]; then
                  num_files=`cat $OUTPUT_ROOT/input_file_list.txt | wc -l`
                  if [ $num_files == 0 ]; then
                     return_code=1
                  fi
               fi
               if [ $return_code == 0 ]; then
                  break
               else
                  ls $FILE_PATH/*.gz | egrep "(_R1_|_1\.f).*.gz$" > $OUTPUT_ROOT/input_file_list.txt 2>/dev/null
                  if [ $? == 0 ]; then
                     break
                  fi
               fi
            done
         fi
      fi

   done


   # confirm file list
   echo "will process the following files: . . . (press Enter/Return for a list - then press space bar to page through listing)"
   read answer
   more $OUTPUT_ROOT/input_file_list.txt
   echo "OK ? (default = y)"
   read_answer_with_default y
   if [ $answer == "y" ]; then
      break
   else
      echo "
ok quitting - you can edit the file $OUTPUT_ROOT/input_file_list.txt in order to customise the
files to process, or remove it so that the data folder is re-scanned for files to process, and try again"
      exit 1
   fi


   # check we can parse sample names from filenames - if not prompt for a custom regular expression 
   echo "
   checking we can parse sample ID from filenames. . . "
   regexp='^[^_]+_(\S+)_'
   while [ 1 ]; do
      for file in `cat $OUTPUT_ROOT/input_file_list.txt`; do
         $BATCH_QIIME_PRISM_BIN/add_sample_name.py -n -r "$regexp" $file
         return_code=$?
         if [ $return_code != 0 ]; then
            echo "Could not parse sample name from $file using a \"$regexp\" - please enter a regular expression to use ( e.g. something like ^([^_]+)_  or ^([^_]+_L1)_ might work)" 
            read_answer_with_default $regexp
            regexp=$answer
            break
         fi
      done
      if [ $return_code == 0 ]; then
         echo "do those sample names look OK ? (y/n)"
         read_answer_with_default y
         if [ $answer != "y" ]; then
            echo "please enter a suitable regular expression. Examples : ^[^_]+_(\S+)_    ^([^_]+)_    ^([^_]+_L1)_ "
            read_answer_with_default $regexp
            regexp=$answer
         else 
            break
         fi
      fi
   done


   echo "
export SEQ_PRISMS_BIN=/dataset/gseq_processing/active/bin/batch_qiime_prism/seq_prisms
export BATCH_QIIME_PRISM_BIN=/dataset/gseq_processing/active/bin/batch_qiime_prism
time $BATCH_QIIME_PRISM_BIN/batch_qiime_prism.sh -x \"$regexp\" -b $BLAST_DATABASE -t $TAX_DATA -m $SEQLENGTH_MIN -s $SIMILARITY  -O  $OUTPUT_ROOT/qiime_analysis \`cat $OUTPUT_ROOT/input_file_list.txt\`" > $OUTPUT_ROOT/run.src


   echo "about to process  data files listed in $OUTPUT_ROOT/input_file_list.txt, writing results to $OUTPUT_ROOT/qiime_analysis"
   echo "using . . ."
   cat  $OUTPUT_ROOT/run.src
   echo "press enter to continue (or CTRL-C to cancel)"
   read junk 
   set -x
   time $BATCH_QIIME_PRISM_BIN/batch_qiime_prism.sh -x "$regexp" -b $BLAST_DATABASE -t $TAX_DATA -m $SEQLENGTH_MIN -s $SIMILARITY  -O  $OUTPUT_ROOT/qiime_analysis `cat $OUTPUT_ROOT/input_file_list.txt`
   set +x

}


function run_kmer_analysis() {
   # set up the kmer anlaysis folder and large-ram config
   mkdir -p  $OUTPUT_ROOT/kmer_analysis
   mkdir -p  $OUTPUT_ROOT/etc
   cp $BATCH_QIIME_PRISM_BIN/etc/64GBRAM_slurm_array_job $OUTPUT_ROOT/etc 
   rm -f $OUTPUT_ROOT/etc/tardis.toml    # in case left over from previous kmer_prism run , which may modify this
   echo "jobtemplatefile = \"$OUTPUT_ROOT/etc/64GBRAM_slurm_array_job\"" > $OUTPUT_ROOT/etc/tardis.toml


   # run kmer analysis
   if [ -f $OUTPUT_ROOT/kmer_analysis/kmer_entropy.k6.jpg ]; then
      echo "skipping kmer analysis as landmark $OUTPUT_ROOT/kmer_analysis/kmer_entropy.k6.jpg exists"
   else
      echo "about to run a kmer analysis for files listed in $OUTPUT_ROOT/input_file_list.txt, writing results to $OUTPUT_ROOT/kmer_analysis"
      echo "press enter to continue (or CTRL-C to cancel)"
      read junk
      set -x
      $SEQ_PRISMS_BIN/kmer_prism.sh -s .05 -a fasta -O  $OUTPUT_ROOT/kmer_analysis `cat $OUTPUT_ROOT/input_file_list.txt`  1>$OUTPUT_ROOT/kmer_analysis.stdout 2>$OUTPUT_ROOT/kmer_analysis.stderr
      set +x
   fi
}

get_run_opts


if [[ ( $ANALYSIS == "qiime" ) || ( $ANALYSIS == "all" ) ]]; then
   run_qiime_analysis
fi
if [[ ( $ANALYSIS == "kmer_analysis" ) || ( $ANALYSIS == "all" ) ]]; then
   run_kmer_analysis
fi