diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..4a9ffde --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,30 @@ +--- +name: Bug report +about: Create a report to help us improve +title: "[BUG]" +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behaviour, including the +1. Command executed +2. Error message + +**Expected behaviour** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. Ubuntu] + - OS version [e.g. 20.04 LTS] + - Python version [e.g. 3.7] + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md new file mode 100644 index 0000000..48d5f81 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/custom.md @@ -0,0 +1,10 @@ +--- +name: Custom issue template +about: Describe this issue template's purpose here. +title: '' +labels: '' +assignees: '' + +--- + + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bbcbbe7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. 
+ +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..9898434 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,76 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + push: + branches: [ "develop" ] + pull_request: + # The branches below must be a subset of the branches above + branches: [ "develop" ] + schedule: + - cron: '34 3 * * 1' + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Use only 'java' to analyze code written in Java, Kotlin or both + # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both + # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. 
+ + # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # ℹī¸ Command-line programs to run using the OS shell. + # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # If the Autobuild fails above, remove it and uncomment the following three lines. + # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. + + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml new file mode 100644 index 0000000..bdaab28 --- /dev/null +++ b/.github/workflows/pypi-publish.yml @@ -0,0 +1,39 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. 
+ +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/testing_python_app.yml b/.github/workflows/testing_python_app.yml index ec970f0..963b3b7 100644 --- a/.github/workflows/testing_python_app.yml +++ b/.github/workflows/testing_python_app.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [ master ] + branches: [ develop ] pull_request: - branches: [ master ] + branches: [ develop ] jobs: @@ -14,16 +14,16 @@ jobs: strategy: matrix: - os: [macos-latest, ubuntu-latest] - python-version: ["3.7", "3.8", "3.9", "3.10"] + os: [macos-12, ubuntu-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - - uses: "actions/checkout@v2" + - uses: "actions/checkout@v3" with: fetch-depth: 0 # Setup env - - uses: "actions/setup-python@v2" + - uses: "actions/setup-python@v3" with: python-version: "${{ matrix.python-version }}" @@ -35,10 +35,11 @@ jobs: - name: "Generate coverage report on ${{ matrix.os }} for Python ${{ matrix.python-version }}" run: | pip install pytest pytest-cov - pytest --cov=tests --cov-report=xml --cov-append + pytest --cov=graphbin --cov-report=xml --cov-append - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v3 with: + token: ${{ secrets.CODECOV_TOKEN }} file: coverage.xml fail_ci_if_error: true diff --git a/.github/workflows/testing_python_conda.yml b/.github/workflows/testing_python_conda.yml new file mode 100644 index 0000000..78291a5 
--- /dev/null +++ b/.github/workflows/testing_python_conda.yml @@ -0,0 +1,46 @@ +name: CI conda + +on: + push: + branches: [ develop ] + pull_request: + branches: [ develop ] + + +jobs: + tests: + name: "Python ${{ matrix.python-version }}" + runs-on: ${{ matrix.os }} + + defaults: + run: + shell: bash -el {0} + + strategy: + matrix: + os: [macos-12, ubuntu-latest] + python-version: ["3.8", "3.9", "3.10"] + + steps: + - uses: "actions/checkout@v3" + with: + fetch-depth: 0 + + # Setup conda env + - uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: graphbin + environment-file: environment.yml + python-version: ${{ matrix.python-version }} + auto-activate-base: false + + - name: "Setup graphbin on ${{ matrix.os }} for Python ${{ matrix.python-version }}" + run: | + python -m pip install --upgrade pip + pip install . + + - name: "Generate coverage report on ${{ matrix.os }} for Python ${{ matrix.python-version }}" + run: | + pip install pytest pytest-cov + pytest --cov=graphbin --cov-report=xml --cov-append + \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..96f6364 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,67 @@ +# Contributing to GraphBin project + +We love to have your contributions to the GraphBin project, whether it's: +* Reporting a bug +* Submitting a fix +* Proposing new features + +## Clone and install GraphBin onto your machine + +First, make sure you have [git](https://github.com/git-guides/install-git) installed on your machine. + +On GitHub, [fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the GraphBin repository and [clone](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) it to your machine. + +``` +# clone repository to your local machine +git clone https://github.com/metagentools/GraphBin.git +``` + +Move to the GraphBin directory and install GraphBin via [flit](https://pypi.org/project/flit/). 
+ + ``` + # go to repo directory + cd GraphBin + + # install flit + pip install flit + + # install graphbin via flit + flit install -s --python `which python` + ``` + + ## Test GraphBin installation + + Use the following command to run [pytest](https://docs.pytest.org/en/7.1.x/) and all the tests should pass. + + ``` + pytest + ``` + + ## Coding Style + + We adhere to the [PEP 8](https://peps.python.org/pep-0008/) style guide. + + Before committing, make sure to run [`black`](https://pypi.org/project/black/) and [`isort`](https://pypi.org/project/isort/). + + ## Report bugs using Github's issues + + We use GitHub issues to track public bugs. Report a bug by opening a new issue in GitHub [issues](https://github.com/metagentools/GraphBin/issues). You will get to select between templates for bug report and feature request. + + ## Committing code + + Once you have finished coding and all the tests pass, commit your code and make a pull request. Make sure to follow the commit style of [c3dev](https://github.com/cogent3/c3dev/wiki#style-for-commit-messages). + + ``` + git commit -m "" + git push + ``` + + Your contribution will be reviewed before accepting it. + + ## License + + By contributing, you agree that your contributions will be licensed under the BSD-3 License. + + ## References + + This document was adapted from the open-source contribution guidelines for [Transcriptase](https://github.com/briandk/transcriptase-atom/blob/master/CONTRIBUTING.md) and [c3dev](https://github.com/cogent3/c3dev/wiki/How-to-Contribute-Code). \ No newline at end of file diff --git a/GraphBin_logo_dark.png b/GraphBin_logo_dark.png new file mode 100644 index 0000000..5f1121d Binary files /dev/null and b/GraphBin_logo_dark.png differ diff --git a/GraphBin_logo_light.png b/GraphBin_logo_light.png new file mode 100644 index 0000000..85c8ada Binary files /dev/null and b/GraphBin_logo_light.png differ diff --git a/README.md b/README.md index 65cd419..d8007dc 100644 --- a/README.md +++ b/README.md @@ -1,47 +1,63 @@

- Final Labelling + GraphBin logo + GraphBin logo

# GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs +[![DOI](https://img.shields.io/badge/DOI-10.1093/bioinformatics/btaa180-informational)](https://doi.org/10.1093/bioinformatics/btaa180) +[![Anaconda-Server Badge](https://anaconda.org/bioconda/graphbin/badges/version.svg)](https://anaconda.org/bioconda/graphbin) +[![Anaconda-Server Badge](https://anaconda.org/bioconda/graphbin/badges/downloads.svg)](https://anaconda.org/bioconda/graphbin) +[![PyPI version](https://badge.fury.io/py/graphbin.svg)](https://badge.fury.io/py/graphbin) +[![Downloads](https://static.pepy.tech/badge/graphbin)](https://pepy.tech/project/graphbin) + [![CI](https://github.com/metagentools/GraphBin/actions/workflows/testing_python_app.yml/badge.svg)](https://github.com/metagentools/GraphBin/actions/workflows/testing_python_app.yml) -[![codecov](https://codecov.io/gh/metagentools/GraphBin/branch/master/graph/badge.svg?token=0S310F6QXJ)](https://codecov.io/gh/metagentools/GraphBin) +[![codecov](https://codecov.io/gh/metagentools/GraphBin/branch/develop/graph/badge.svg?token=0S310F6QXJ)](https://codecov.io/gh/metagentools/GraphBin) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -[![Anaconda-Server Badge](https://anaconda.org/bioconda/graphbin/badges/installer/conda.svg)](https://anaconda.org/bioconda/graphbin) -[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Vini2/GraphBin.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Vini2/GraphBin/context:python) +[![CodeQL](https://github.com/metagentools/GraphBin/actions/workflows/codeql.yml/badge.svg)](https://github.com/metagentools/GraphBin/actions/workflows/codeql.yml) [![Documentation Status](https://readthedocs.org/projects/graphbin/badge/?version=latest)](https://graphbin.readthedocs.io/en/latest/?badge=latest) -**GraphBin** is a NGS data-based metagenomic contig bin refinment tool that makes use of the contig connectivity 
information from the assembly graph to bin contigs. It utilizes the binning result of an existing binning tool and a label propagation algorithm to correct mis-binned contigs and predict the labels of contigs which are discarded due to short length. +**GraphBin** is an NGS data-based metagenomic contig bin refinement tool that makes use of the contig connectivity information from the assembly graph to bin contigs. It utilizes the binning result of an existing binning tool and a label propagation algorithm to correct mis-binned contigs and predict the labels of contigs which are discarded due to short length. **For detailed instructions on installation, usage and visualisation, please refer to the [documentation hosted at Read the Docs](https://graphbin.readthedocs.io/).** ## Dependencies -GraphBin installation requires python 3 (tested on Python 3.6 and 3.7). The following dependencies are required to run GraphBin and related support scripts. -* [python-igraph](https://igraph.org/python/) - version 0.7.1 -* [biopython](https://biopython.org/) - version 1.74 +GraphBin installation requires python 3 to run. The following dependencies are required to run GraphBin and related support scripts. +* [python-igraph](https://igraph.org/python/) +* [cogent3](https://cogent3.org/) * [cairocffi](https://pypi.org/project/cairocffi/) +* [click](https://click.palletsprojects.com/) ## Installing GraphBin ### Using Conda -You can install GraphBin via [Conda](https://docs.conda.io/en/latest/). You can download [Anaconda](https://www.anaconda.com/distribution/) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) which contains Conda. +You can install GraphBin using the [bioconda](https://anaconda.org/bioconda/graphbin) distribution. You can download +[Anaconda](https://www.anaconda.com/distribution/) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) which contains `conda`. 
``` -# create conda environment and install -conda create -n graphbin -c bioconda graphbin +# add channels +conda config --add channels defaults +conda config --add channels bioconda +conda config --add channels conda-forge + +# create conda environment +conda create -n graphbin # activate conda environment conda activate graphbin +# install graphbin +conda install -c bioconda graphbin + # check graphbin installation graphbin -h ``` ### Using pip -You can install GraphBin using pip. +You can install GraphBin using `pip` from the [PyPI](https://pypi.org/project/graphbin/) distribution. ``` pip install graphbin @@ -53,7 +69,7 @@ For ***development*** purposes, please clone the repository and install via [fli # clone repository to your local machine git clone https://github.com/metagentools/GraphBin.git -# go to repo direcotry +# go to repo directory cd GraphBin # install flit @@ -107,7 +123,7 @@ graphbin --assembler megahit --graph /path/to/graph_file.gfa --contigs /path/to/ ## Citation If you use GraphBin in your work, please cite GraphBin as, -Vijini Mallawaarachchi, Anuradha Wickramarachchi, Yu Lin. GraphBin: Refined binning of metagenomic contigs using assembly graphs. Bioinformatics, Volume 36, Issue 11, June 2020, Pages 3307–3313, DOI: [10.1093/bioinformatics/btaa180](http://dx.doi.org/10.1093/bioinformatics/btaa180) +Vijini Mallawaarachchi, Anuradha Wickramarachchi, Yu Lin. GraphBin: Refined binning of metagenomic contigs using assembly graphs. Bioinformatics, Volume 36, Issue 11, June 2020, Pages 3307–3313, DOI: [https://doi.org/10.1093/bioinformatics/btaa180](https://doi.org/10.1093/bioinformatics/btaa180) ```bibtex @article{10.1093/bioinformatics/btaa180, @@ -119,10 +135,18 @@ Vijini Mallawaarachchi, Anuradha Wickramarachchi, Yu Lin. GraphBin: Refined binn pages = {3307-3313}, year = {2020}, month = {03}, - abstract = "{The field of metagenomics has provided valuable insights into the structure, diversity and ecology within microbial communities. 
One key step in metagenomics analysis is to assemble reads into longer contigs which are then binned into groups of contigs that belong to different species present in the metagenomic sample. Binning of contigs plays an important role in metagenomics and most available binning algorithms bin contigs using genomic features such as oligonucleotide/k-mer composition and contig coverage. As metagenomic contigs are derived from the assembly process, they are output from the underlying assembly graph which contains valuable connectivity information between contigs that can be used for binning.We propose GraphBin, a new binning method that makes use of the assembly graph and applies a label propagation algorithm to refine the binning result of existing tools. We show that GraphBin can make use of the assembly graphs constructed from both the de Bruijn graph and the overlap-layout-consensus approach. Moreover, we demonstrate improved experimental results from GraphBin in terms of identifying mis-binned contigs and binning of contigs discarded by existing binning tools. To the best of our knowledge, this is the first time that the information from the assembly graph has been used in a tool for the binning of metagenomic contigs.The source code of GraphBin is available at https://github.com/Vini2/GraphBin.vijini.mallawaarachchi@anu.edu.au or yu.lin@anu.edu.auSupplementary data are available at Bioinformatics online.}", + abstract = "{The field of metagenomics has provided valuable insights into the structure, diversity and ecology within microbial communities. One key step in metagenomics analysis is to assemble reads into longer contigs which are then binned into groups of contigs that belong to different species present in the metagenomic sample. Binning of contigs plays an important role in metagenomics and most available binning algorithms bin contigs using genomic features such as oligonucleotide/k-mer composition and contig coverage. 
As metagenomic contigs are derived from the assembly process, they are output from the underlying assembly graph which contains valuable connectivity information between contigs that can be used for binning. We propose GraphBin, a new binning method that makes use of the assembly graph and applies a label propagation algorithm to refine the binning result of existing tools. We show that GraphBin can make use of the assembly graphs constructed from both the de Bruijn graph and the overlap-layout-consensus approach. Moreover, we demonstrate improved experimental results from GraphBin in terms of identifying mis-binned contigs and binning of contigs discarded by existing binning tools. To the best of our knowledge, this is the first time that the information from the assembly graph has been used in a tool for the binning of metagenomic contigs. The source code of GraphBin is available at https://github.com/Vini2/GraphBin.vijini.mallawaarachchi@anu.edu.au or yu.lin@anu.edu.auSupplementary data are available at Bioinformatics online.}", issn = {1367-4803}, doi = {10.1093/bioinformatics/btaa180}, url = {https://doi.org/10.1093/bioinformatics/btaa180}, eprint = {https://academic.oup.com/bioinformatics/article-pdf/36/11/3307/33329097/btaa180.pdf}, } ``` + +## Funding + +GraphBin is funded by an [Essential Open Source Software for Science Grant](https://chanzuckerberg.com/eoss/proposals/cogent3-python-apis-for-iq-tree-and-graphbin-via-a-plug-in-architecture/) from the Chan Zuckerberg Initiative. + +

+ +

diff --git a/docs/install.md b/docs/install.md index 06d523c..e5012c3 100644 --- a/docs/install.md +++ b/docs/install.md @@ -12,25 +12,40 @@ GraphBin installation requires python 3 (tested on Python 3.6 and 3.7). The foll ### Method 1: conda install -You can install GraphBin via [Conda](https://docs.conda.io/en/latest/). You can download [Anaconda](https://www.anaconda.com/distribution/) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) which contains Conda. +You can install GraphBin using the [bioconda](https://anaconda.org/bioconda/graphbin) distribution. You can download +[Anaconda](https://www.anaconda.com/distribution/) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) which contains `conda`. -Once you have installed Conda, you can install conda directly from the bioconda distribution using the command +Now let's add the required conda channels so we know the locations where packages are stored. + +``` +conda config --add channels defaults +conda config --add channels bioconda +conda config --add channels conda-forge +``` + +Once you have added the channels, you can install GraphBin directly from the bioconda distribution using the command ``` conda install -c bioconda graphbin ``` -You can also create a new conda environment and install GraphBin from bioconda using the following command and activate it. +You can also create a new conda environment and install GraphBin from bioconda using the following commands. ``` -conda create -n graphbin -c bioconda graphbin +# create conda environment +conda create -n graphbin + +# activate conda environment conda activate graphbin + +# install graphbin +conda install -c bioconda graphbin ``` ### Method 2: pip install -You can install GraphBin using pip. +You can install GraphBin using `pip` from the [PyPI](https://pypi.org/project/graphbin/) distribution. 
``` pip install graphbin @@ -38,4 +53,4 @@ pip install graphbin After setup, check if GraphBin is properly installed by typing `graphbin -h` on the command line. You should see the usage options as shown in section [Using GraphBin](https://github.com/Vini2/GraphBin#using-graphbin) -Now let's prepare our results to run GraphBin. \ No newline at end of file +Now let's prepare our results to run GraphBin. diff --git a/docs/requirements.txt b/docs/requirements.txt index c6f008d..3e2bdf5 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1,18 @@ -jinja2<3.1.0 +jinja2==3.0.3 +mkdocs>=1.3.1 +babel>=2.9.0 +click>=7.0 +Jinja2>=2.10.2 +Markdown>=3.2.1,<3.4 +PyYAML>=5.2 +watchdog>=2.0.0 +mdx_gh_links>=0.2 +ghp-import>=1.0 +pyyaml_env_tag>=0.1 +mkdocs-redirects>=1.0.1 +importlib_metadata>=4.3 +packaging>=20.5 +mergedeep>=1.3.4 +pygments>=2.12 +pymdown-extensions +mkdocs-material diff --git a/environment.yml b/environment.yml index 9cb066b..b66813f 100644 --- a/environment.yml +++ b/environment.yml @@ -1,11 +1,13 @@ name: graphbin channels: - bioconda - - defaults + - conda-forge + - anaconda dependencies: - python>=3.7.1 + - cairocffi + - python-igraph + - click - pip - pip: - cogent3 - - cairocffi - - python-igraph>=0.7.1 diff --git a/pyproject.toml b/pyproject.toml index b88cde5..52fde30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ keywords = ["genomics", "bioinformatics"] readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.7" -dependencies = ["python-igraph", "cogent3", "cairocffi"] +dependencies = ["igraph", "cogent3", "cairocffi", "click"] classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Science/Research", diff --git a/requirements.txt b/requirements.txt index 23fb535..a8e025b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ cogent3 -python-igraph>=0.7.1 -cairocffi \ No newline at end of file +igraph>=0.7.1 +cairocffi +click \ No newline at end of file 
diff --git a/src/graphbin/__init__.py b/src/graphbin/__init__.py index 6b2bdb3..f2bfbed 100755 --- a/src/graphbin/__init__.py +++ b/src/graphbin/__init__.py @@ -2,185 +2,238 @@ """graphbin: Refined binning of metagenomic contigs using assembly graphs.""" +import logging import os import sys +import click + from graphbin.utils import ( graphbin_Canu, graphbin_Flye, graphbin_MEGAHIT, graphbin_Miniasm, - graphbin_Options, graphbin_SGA, graphbin_SPAdes, ) + __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] __license__ = "BSD-3" -__version__ = "1.6.0" +__version__ = "1.7.1" __maintainer__ = "Vijini Mallawaarachchi" __email__ = "vijini.mallawaarachchi@anu.edu.au" __status__ = "Production" -def run(args): - RUNNER = { - "canu": graphbin_Canu.run, - "flye": graphbin_Flye.run, - "megahit": graphbin_MEGAHIT.run, - "miniasm": graphbin_Miniasm.run, - "sga": graphbin_SGA.run, - "spades": graphbin_SPAdes.run, - } - RUNNER[args.assembler](args) - - -def main(): - parser = graphbin_Options.PARSER - parser.add_argument( - "--assembler", - type=str, - help="name of the assembler used (SPAdes, SGA or MEGAHIT). GraphBin supports Flye, Canu and Miniasm long-read assemblies as well.", - default="", - ) - parser.add_argument( - "--paths", - default=None, - required=False, - help="path to the contigs.paths file, only needed for SPAdes", - ) - parser.add_argument( - "--contigs", - default=None, - help="path to the contigs.fa file.", - ) - parser.add_argument( - "--delimiter", - required=False, - type=str, - default=",", - help="delimiter for input/output results. 
Supports a comma (,), a semicolon (;), a tab ($'\\t'), a space (\" \") and a pipe (|) [default: , (comma)]", - ) - - args = parser.parse_args() - - if args.version: - print("GraphBin version %s" % __version__) - sys.exit(0) - - # Validation of inputs - # --------------------------------------------------- - # Check assembler type - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - - args.assembler = args.assembler.lower() - - if not ( - args.assembler.lower() == "spades" - or args.assembler.lower() == "sga" - or args.assembler.lower() == "megahit" - or args.assembler.lower() == "flye" - or args.assembler.lower() == "canu" - or args.assembler.lower() == "miniasm" +class ArgsObj: + def __init__( + self, + assembler, + graph, + contigs, + paths, + binned, + output, + prefix, + max_iteration, + diff_threshold, + delimiter, ): - print( - "\nPlease make sure to provide the correct assembler type (SPAdes, SGA or MEGAHIT). GraphBin supports Flye, Canu and Miniasm long-read assemblies as well." - ) + self.assembler = assembler + self.graph = graph + self.contigs = contigs + self.paths = paths + self.binned = binned + self.output = output + self.prefix = prefix + self.max_iteration = max_iteration + self.diff_threshold = diff_threshold + self.delimiter = delimiter + + +@click.command() +@click.option( + "--assembler", + help="name of the assembler used (SPAdes, SGA or MEGAHIT). 
GraphBin supports Flye, Canu and Miniasm long-read assemblies as well.", + type=click.Choice( + ["spades", "sga", "megahit", "flye", "canu", "miniasm"], case_sensitive=False + ), + required=True, +) +@click.option( + "--graph", + help="path to the assembly graph file", + type=click.Path(exists=True), + required=True, +) +@click.option( + "--contigs", + help="path to the contigs file", + type=click.Path(exists=True), + required=True, +) +@click.option( + "--paths", + help="path to the contigs.paths (metaSPAdes) or assembly.info (metaFlye) file", + type=click.Path(exists=True), + required=False, +) +@click.option( + "--binned", + help="path to the .csv file with the initial binning output from an existing tool", + type=click.Path(exists=True), + required=True, +) +@click.option( + "--output", + help="path to the output folder", + type=click.Path(dir_okay=True, writable=True, readable=True), + required=True, +) +@click.option( + "--prefix", + help="prefix for the output file", + type=str, + required=False, +) +@click.option( + "--max_iteration", + help="maximum number of iterations for label propagation algorithm", + type=int, + default=100, + show_default=True, + required=False, +) +@click.option( + "--diff_threshold", + help="difference threshold for label propagation algorithm", + type=click.FloatRange(0, 1), + default=0.1, + show_default=True, + required=False, +) +@click.option( + "--delimiter", + help="delimiter for input/output results. 
Supports a comma (,), a semicolon (;), a tab ($'\\t'), a space (\" \") and a pipe (|)", + type=click.Choice([",", ";", "$'\\t'", '" "'], case_sensitive=False), + default=",", + show_default=True, + required=False, +) +@click.version_option(__version__, "-v", "--version", is_flag=True) +def main( + assembler, + graph, + contigs, + paths, + binned, + output, + prefix, + max_iteration, + diff_threshold, + delimiter, +): + """ + GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs + """ + + # Setup logger + # --------------------------------------------------- - print("\nExiting GraphBin...\nBye...!\n") - sys.exit(1) + logger = logging.getLogger("GraphBin %s" % __version__) + logger.setLevel(logging.DEBUG) + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + consoleHeader = logging.StreamHandler() + consoleHeader.setFormatter(formatter) + consoleHeader.setLevel(logging.INFO) + logger.addHandler(consoleHeader) - # Check assembly graph file - if not os.path.exists(args.graph): - print("\nFailed to open the assembly graph file.") + fileHandler = logging.FileHandler(f"{output}{prefix}graphbin.log") + fileHandler.setLevel(logging.DEBUG) + fileHandler.setFormatter(formatter) + logger.addHandler(fileHandler) - print("\nExiting GraphBin...\nBye...!\n") - sys.exit(1) + # Validate options + # --------------------------------------------------- # Check if paths files is provided when the assembler type is SPAdes - if args.assembler.lower() == "spades" and args.paths is None: - print("\nPlease make sure to provide the path to the contigs.paths file.") - - print("\nExiting GraphBin...\nBye...!\n") - sys.exit(1) - - # Check contigs.paths file for SPAdes - if args.assembler.lower() == "spades" and not os.path.exists(args.paths): - print("\nFailed to open the contigs.paths file.") - - print("\nExiting GraphBin...\nBye...!\n") - sys.exit(1) - - # Check if contigs.fa files is provided - if args.contigs is None: - print("\nPlease make sure to 
provide the path to the contigs file.") - - print("\nExiting GraphBin...\nBye...!\n") + if assembler.lower() == "spades" and paths is None: + logger.error("Please make sure to provide the path to the contigs.paths file.") + logger.info("Exiting GraphBin... Bye...!") sys.exit(1) - # Check contigs file - if not os.path.exists(args.contigs): - print("\nFailed to open the contigs file.") - - print("\nExiting GraphBin...\nBye...!\n") + # Check if paths files is provided when the assembler type is Flye + if assembler.lower() == "flye" and paths is None: + logger.error("Please make sure to provide the path to the contigs.paths file.") + logger.info("Exiting GraphBin... Bye...!") sys.exit(1) - # Check the file with the initial binning output - if not os.path.exists(args.binned): - print("\nFailed to open the file with the initial binning output.") - - print("\nExiting GraphBin...\nBye...!\n") - sys.exit(1) - - # Handle for missing trailing forwardslash in output folder path - if args.output[-1:] != "/": - args.output = args.output + "/" - - # Create output folder if it does not exist - os.makedirs(args.output, exist_ok=True) - # Validate prefix - if args.prefix != "": - if not args.prefix.endswith("_"): - args.prefix = args.prefix + "_" - - # Validate delimiter - delimiters = [",", ";", " ", "\t", "|"] - - if args.delimiter not in delimiters: - print("\nPlease enter a valid delimiter") - print("Exiting GraphBin...\nBye...!\n") - sys.exit(1) + if prefix != None: + if not prefix.endswith("_"): + prefix = prefix + "_" + else: + prefix = "" # Validate max_iteration - if args.max_iteration <= 0: - print("\nPlease enter a valid number for max_iteration") - - print("\nExiting GraphBin...\nBye...!\n") + if max_iteration <= 0: + logger.error("Please enter a valid number for max_iteration") + logger.info("Exiting GraphBin... 
Bye...!") sys.exit(1) # Validate diff_threshold - if args.diff_threshold < 0: - print("\nPlease enter a valid number for diff_threshold") - - print("\nExiting GraphBin...\nBye...!\n") + if diff_threshold < 0: + logger.error("Please enter a valid number for diff_threshold") + logger.info("Exiting GraphBin... Bye...!") sys.exit(1) - # Remove previous files if they exist - if os.path.exists(args.output + args.prefix + "graphbin.log"): - os.remove(args.output + args.prefix + "graphbin.log") - if os.path.exists(args.output + args.prefix + "graphbin_output.csv"): - os.remove(args.output + args.prefix + "graphbin_output.csv") - if os.path.exists(args.output + args.prefix + "graphbin_unbinned.csv"): - os.remove(args.output + args.prefix + "graphbin_unbinned.csv") + # # Remove previous files if they exist + # if os.path.exists(f"{output}{prefix}graphbin.log"): + # os.remove(f"{output}{prefix}graphbin.log") + # if os.path.exists(f"{output}{prefix}graphbin_output.csv"): + # os.remove(f"{output}{prefix}graphbin_output.csv") + # if os.path.exists(f"{output}{prefix}graphbin_unbinned.csv"): + # os.remove(f"{output}{prefix}graphbin_unbinned.csv") + + # Make args object + args = ArgsObj( + assembler, + graph, + contigs, + paths, + binned, + output, + prefix, + max_iteration, + diff_threshold, + delimiter, + ) # Run GraphBin # --------------------------------------------------- - run(args) + if assembler.lower() == "canu": + graphbin_Canu.main(args) + if assembler.lower() == "flye": + graphbin_Flye.main(args) + if assembler.lower() == "megahit": + graphbin_MEGAHIT.main(args) + if assembler.lower() == "miniasm": + graphbin_Miniasm.main(args) + if assembler.lower() == "sga": + graphbin_SGA.main(args) + if assembler.lower() == "spades": + graphbin_SPAdes.main(args) + + # Exit program + # -------------- + + logger.info("Thank you for using GraphBin! 
Bye...!") + + logger.removeHandler(fileHandler) + logger.removeHandler(consoleHeader) if __name__ == "__main__": diff --git a/src/graphbin/support/gfa2fasta.py b/src/graphbin/support/gfa2fasta.py index c347720..b4c39c0 100644 --- a/src/graphbin/support/gfa2fasta.py +++ b/src/graphbin/support/gfa2fasta.py @@ -13,6 +13,7 @@ from cogent3 import make_unaligned_seqs + __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] @@ -83,7 +84,6 @@ # Validate prefix # --------------------------------------------------- try: - if args["prefix"] != "": if args["prefix"].endswith("_"): prefix = args["prefix"] @@ -105,12 +105,10 @@ sequences = {} with open(assembly_graph_file) as file: - for line in file.readlines(): line = line.strip() if line.startswith("S"): - strings = line.split("\t") print(strings) diff --git a/src/graphbin/support/prep_result.py b/src/graphbin/support/prep_result.py index bf0b4bb..f37f4ee 100644 --- a/src/graphbin/support/prep_result.py +++ b/src/graphbin/support/prep_result.py @@ -17,6 +17,7 @@ from cogent3.parse.fasta import MinimalFastaParser + __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] @@ -127,7 +128,6 @@ # Validate prefix # --------------------------------------------------- try: - if args["prefix"] != "": if args["prefix"].endswith("_"): prefix = args["prefix"] @@ -159,11 +159,8 @@ contig_bins = [] for bin_file in files: - if bin_file.lower().endswith((".fasta", ".fa", ".fna")): - for contig_name, _ in MinimalFastaParser(contig_bins_folder + bin_file): - line = [contig_name, str(bin_file)] contig_bins.append(line) diff --git a/src/graphbin/support/visualise_result_Flye_Canu_Miniasm.py b/src/graphbin/support/visualise_result_Flye_Canu_Miniasm.py index e169404..df618ac 100644 --- 
a/src/graphbin/support/visualise_result_Flye_Canu_Miniasm.py +++ b/src/graphbin/support/visualise_result_Flye_Canu_Miniasm.py @@ -19,6 +19,7 @@ from bidirectionalmap.bidirectionalmap import BidirectionalMap from igraph import * + __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] @@ -209,7 +210,6 @@ try: # Get contig connections from .gfa file with open(assembly_graph_file) as file: - for line in file.readlines(): line = line.strip() @@ -223,7 +223,6 @@ # Identify lines with link information elif line.startswith("L"): - link = [] strings = line.split("\t") @@ -250,7 +249,6 @@ # ------------------------------- try: - # Create the graph assembly_graph = Graph() diff --git a/src/graphbin/support/visualise_result_MEGAHIT.py b/src/graphbin/support/visualise_result_MEGAHIT.py index 91e633b..a0b6c42 100644 --- a/src/graphbin/support/visualise_result_MEGAHIT.py +++ b/src/graphbin/support/visualise_result_MEGAHIT.py @@ -21,6 +21,7 @@ from cogent3.parse.fasta import MinimalFastaParser from igraph import * + __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] @@ -231,10 +232,8 @@ my_map = BidirectionalMap() try: - # Get links from .gfa file with open(assembly_graph_file) as file: - for line in file.readlines(): line = line.strip() diff --git a/src/graphbin/support/visualise_result_SGA.py b/src/graphbin/support/visualise_result_SGA.py index 74a9428..88ee0d1 100644 --- a/src/graphbin/support/visualise_result_SGA.py +++ b/src/graphbin/support/visualise_result_SGA.py @@ -20,6 +20,7 @@ from bidirectionalmap.bidirectionalmap import BidirectionalMap from igraph import * + __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] @@ 
-244,7 +245,6 @@ # ------------------------------- try: - # Create the graph assembly_graph = Graph() diff --git a/src/graphbin/support/visualise_result_SPAdes.py b/src/graphbin/support/visualise_result_SPAdes.py index 1eeaa6e..88e8ff4 100644 --- a/src/graphbin/support/visualise_result_SPAdes.py +++ b/src/graphbin/support/visualise_result_SPAdes.py @@ -20,6 +20,7 @@ from bidirectionalmap.bidirectionalmap import BidirectionalMap from igraph import * + __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] @@ -218,7 +219,6 @@ path = file.readline() while name != "" and path != "": - while ";" in path: path = path[:-2] + "," + file.readline() diff --git a/src/graphbin/utils/graphbin_Canu.py b/src/graphbin/utils/graphbin_Canu.py index da33fb9..73bcdb9 100755 --- a/src/graphbin/utils/graphbin_Canu.py +++ b/src/graphbin/utils/graphbin_Canu.py @@ -11,43 +11,32 @@ graphbin_Canu.py makes use of the assembly graphs produced by Canu long read assembler. 
""" -import csv import logging -import os -import subprocess -import sys import time -from cogent3.parse.fasta import MinimalFastaParser -from igraph import * +from graphbin.utils.graphbin_Func import graphbin_main +from graphbin.utils.parsers import get_initial_bin_count +from graphbin.utils.parsers.canu_parser import ( + get_initial_binning_result, + parse_graph, + write_output, +) -from graphbin.utils.bidirectionalmap.bidirectionalmap import BidirectionalMap -from graphbin.utils.graphbin_Func import getClosestLabelledVertices -from graphbin.utils.graphbin_Options import PARSER -from graphbin.utils.labelpropagation.labelprop import LabelProp __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] __license__ = "BSD-3" -__version__ = "1.6.0" +__version__ = "1.7.1" __maintainer__ = "Vijini Mallawaarachchi" __email__ = "vijini.mallawaarachchi@anu.edu.au" __status__ = "Production" +# create logger +logger = logging.getLogger("GraphBin %s" % __version__) -def run(args): - # Setup logger - # ----------------------- - - logger = logging.getLogger("GraphBin %s" % __version__) - logger.setLevel(logging.DEBUG) - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - consoleHeader = logging.StreamHandler() - consoleHeader.setFormatter(formatter) - consoleHeader.setLevel(logging.INFO) - logger.addHandler(consoleHeader) +def run(args): start_time = time.time() assembly_graph_file = args.graph @@ -60,14 +49,6 @@ def run(args): diff_threshold = args.diff_threshold MIN_BIN_COUNT = 10 - # Setup output path for log file - # --------------------------------------------------- - - fileHandler = logging.FileHandler(output_path + "/" + prefix + "graphbin.log") - fileHandler.setLevel(logging.DEBUG) - fileHandler.setFormatter(formatter) - logger.addHandler(fileHandler) - logger.info( "Welcome to GraphBin: Refined Binning of Metagenomic Contigs using 
Assembly Graphs." ) @@ -75,531 +56,69 @@ def run(args): "This version of GraphBin makes use of the assembly graph produced by Canu which is a long reads assembler based on the OLC approach." ) - logger.info("Assembly graph file: " + assembly_graph_file) - logger.info("Existing binning output file: " + contig_bins_file) - logger.info("Final binning output file: " + output_path) - logger.info("Maximum number of iterations: " + str(max_iteration)) - logger.info("Difference threshold: " + str(diff_threshold)) + logger.info(f"Assembly graph file: {assembly_graph_file}") + logger.info(f"Existing binning output file: {contig_bins_file}") + logger.info(f"Final binning output file: {output_path}") + logger.info(f"Maximum number of iterations: {max_iteration}") + logger.info(f"Difference threshold: {diff_threshold}") logger.info("GraphBin started") # Get the number of bins from the initial binning result # -------------------------------------------------------- - n_bins = 0 - - try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=delimiter) - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info("Number of bins available in the binning result: " + str(n_bins)) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the binning result file is provided and it is having the correct format." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - logger.info("Constructing the assembly graph") - - # Get the links from the .gfa file - # ----------------------------------- - - my_map = BidirectionalMap() - - node_count = 0 - - nodes = [] - - links = [] - - try: - # Get contig connections from .gfa file - with open(assembly_graph_file) as file: - - for line in file.readlines(): - line = line.strip() - - # Count the number of contigs - if line.startswith("S"): - strings = line.split("\t") - my_node = strings[1] - my_map[node_count] = my_node - nodes.append(my_node) - node_count += 1 + n_bins, bins_list = get_initial_bin_count(contig_bins_file, delimiter) - # Identify lines with link information - elif line.startswith("L"): + # Get assembly graph + # -------------------- - link = [] - strings = line.split("\t") - - if strings[1] != strings[3]: - start = strings[1] - end = strings[3] - link.append(start) - link.append(end) - links.append(link) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the assembly graph file is provided." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - contigs_map = my_map - contigs_map_rev = my_map.inverse - - logger.info("Total number of contigs available: " + str(node_count)) - - ## Construct the assembly graph - # ------------------------------- - - try: - - # Create the graph - assembly_graph = Graph() - - # Create list of edges - edge_list = [] - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Name vertices - for i in range(len(assembly_graph.vs)): - assembly_graph.vs[i]["id"] = i - assembly_graph.vs[i]["label"] = str(contigs_map[i]) - - # Iterate links - for link in links: - # Remove self loops - if link[0] != link[1]: - # Add edge to list of edges - edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the assembly graph file is provided." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - logger.info("Total number of edges in the assembly graph: " + str(len(edge_list))) + assembly_graph, contigs_map, node_count = parse_graph(assembly_graph_file) # Get initial binning result # ---------------------------- - logger.info("Obtaining the initial binning result") - - bins = [[] for x in range(n_bins)] - - try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=delimiter) - for row in readCSV: - contig_num = contigs_map_rev[row[0]] - - bin_num = bins_list.index(row[1]) - bins[bin_num].append(contig_num) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - # Remove labels of ambiguous vertices - # ------------------------------------- - - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - neighbours_have_same_label_list = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - # Further remove labels of ambiguous vertices - binned_contigs = [] - - for n in range(n_bins): - binned_contigs = sorted(binned_contigs + bins[n]) - - for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices( - assembly_graph, i, binned_contigs - ) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and 
i not in remove_labels: - if my_bin in remove_by_bin: - if ( - len(bins[my_bin]) - len(remove_by_bin[my_bin]) - >= MIN_BIN_COUNT - ): - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - logger.info("Obtaining refined binning result") - - # Get vertices which are not isolated and not in components without any labels - # ----------------------------------------------------------------------------- - - logger.info( - "Deteremining vertices which are not isolated and not in components without any labels" - ) - - non_isolated = [] - - for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length != len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - - logger.info("Number of non-isolated contigs: " + str(len(non_isolated))) - - # Run label propagation - # ----------------------- - - data = [] - - for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i + 1) - assigned = 
True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - - # Check if initial binning result consists of contigs belonging to multiple bins - - multiple_bins = False - - for item in data: - if type(item[1]) is int and type(item[2]) is int: - multiple_bins = True - break - - if multiple_bins: - logger.error( - "Initial binning result consists of contigs belonging to multiple bins. Please make sure that each contig in the initial binning result belongs to only one bin." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - # Label propagation - - lp = LabelProp() - - lp.load_data_from_mem(data) - - logger.info( - "Starting label propagation with eps=" - + str(diff_threshold) - + " and max_iteration=" - + str(max_iteration) + bins = get_initial_binning_result( + n_bins, bins_list, contig_bins_file, contigs_map.inverse, delimiter ) - ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) - - logger.info("Obtaining Label Propagation result") - - for l in ans: - for i in range(n_bins): - if l[1] == i + 1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - # Remove labels of ambiguous vertices + # Run GraphBin logic # ------------------------------------- - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if 
len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) + final_bins, remove_labels, non_isolated = graphbin_main( + n_bins, + bins, + bins_list, + assembly_graph, + node_count, + diff_threshold, + max_iteration, + ) elapsed_time = time.time() - start_time - logger.info("Obtaining the Final Refined Binning result") - - final_bins = {} - - for i in range(n_bins): - for contig in bins[i]: - final_bins[contig] = bins_list[i] - # Print elapsed time for the process - logger.info("Elapsed time: " + str(elapsed_time) + " seconds") + logger.info(f"Elapsed time: {elapsed_time} seconds") # Write result to output file # ----------------------------- - logger.info("Writing the Final Binning result to file") - - output_bins = [] - - output_bins_path = output_path + prefix + "bins/" - output_file = output_path + prefix + "graphbin_output.csv" - - if not os.path.isdir(output_bins_path): - subprocess.run("mkdir -p " + output_bins_path, shell=True) - - bin_files = {} - - for bin_name in set(final_bins.values()): - bin_files[bin_name] = open( - output_bins_path + prefix + "bin_" + bin_name + ".fasta", "w+" - ) - - for label, seq in MinimalFastaParser( - contigs_file, label_to_name=lambda x: x.split()[0] - ): - - contig_num = contigs_map_rev[label] - - if contig_num in final_bins: - bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n") - - # Close output files - for c in set(final_bins.values()): - bin_files[c].close() - - for b in range(len(bins)): - - for contig in bins[b]: - line = [] - line.append(str(contigs_map[contig])) - line.append(bins_list[b]) - output_bins.append(line) - - with open(output_file, mode="w") 
as out_file: - output_writer = csv.writer( - out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - for row in output_bins: - output_writer.writerow(row) - - logger.info("Final binning results can be found in " + str(output_bins_path)) - - unbinned_contigs = [] - - for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append(str(contigs_map[i])) - unbinned_contigs.append(line) - - if len(unbinned_contigs) != 0: - unbinned_file = output_path + prefix + "graphbin_unbinned.csv" - - with open(unbinned_file, mode="w") as out_file: - output_writer = csv.writer( - out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at " + unbinned_file) - - # Exit program - # -------------- - - logger.info("Thank you for using GraphBin! Bye...!") - - logger.removeHandler(fileHandler) - logger.removeHandler(consoleHeader) + write_output( + output_path, + prefix, + final_bins, + contigs_file, + contigs_map.inverse, + bins, + contigs_map, + bins_list, + delimiter, + node_count, + remove_labels, + non_isolated, + ) -def main(): - # Setup argument parser - # ----------------------- - ap = PARSER - args = ap.parse_args() +def main(args): run(args) diff --git a/src/graphbin/utils/graphbin_Flye.py b/src/graphbin/utils/graphbin_Flye.py index e406ec1..a631cc9 100755 --- a/src/graphbin/utils/graphbin_Flye.py +++ b/src/graphbin/utils/graphbin_Flye.py @@ -11,42 +11,32 @@ graphbin_Flye.py makes use of the assembly graphs produced by Flye long read assembler. 
""" -import csv import logging -import os -import subprocess -import sys import time -from cogent3.parse.fasta import MinimalFastaParser -from igraph import * +from graphbin.utils.graphbin_Func import graphbin_main +from graphbin.utils.parsers import get_initial_bin_count +from graphbin.utils.parsers.flye_parser import ( + get_initial_binning_result, + parse_graph, + write_output, +) -from graphbin.utils.bidirectionalmap.bidirectionalmap import BidirectionalMap -from graphbin.utils.graphbin_Func import getClosestLabelledVertices -from graphbin.utils.graphbin_Options import PARSER -from graphbin.utils.labelpropagation.labelprop import LabelProp __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] __license__ = "BSD-3" -__version__ = "1.6.0" +__version__ = "1.7.1" __maintainer__ = "Vijini Mallawaarachchi" __email__ = "vijini.mallawaarachchi@anu.edu.au" __status__ = "Production" +# create logger +logger = logging.getLogger("GraphBin %s" % __version__) -def run(args): - # Setup logger - # ----------------------- - logger = logging.getLogger("GraphBin %s" % __version__) - logger.setLevel(logging.DEBUG) - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - consoleHeader = logging.StreamHandler() - consoleHeader.setFormatter(formatter) - consoleHeader.setLevel(logging.INFO) - logger.addHandler(consoleHeader) +def run(args): start_time = time.time() assembly_graph_file = args.graph @@ -58,15 +48,6 @@ def run(args): delimiter = args.delimiter max_iteration = args.max_iteration diff_threshold = args.diff_threshold - MIN_BIN_COUNT = 10 - - # Setup output path for log file - # --------------------------------------------------- - - fileHandler = logging.FileHandler(output_path + "/" + prefix + "graphbin.log") - fileHandler.setLevel(logging.DEBUG) - fileHandler.setFormatter(formatter) - logger.addHandler(fileHandler) logger.info( 
"Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs." @@ -75,631 +56,73 @@ def run(args): "This version of GraphBin makes use of the assembly graph produced by Flye which is a long reads assembler based on the de Bruijn graph approach." ) - logger.info("Assembly graph file: " + assembly_graph_file) - logger.info("Existing binning output file: " + contig_bins_file) - logger.info("Contig paths file: " + contig_paths) - logger.info("Final binning output file: " + output_path) - logger.info("Maximum number of iterations: " + str(max_iteration)) - logger.info("Difference threshold: " + str(diff_threshold)) + logger.info(f"Assembly graph file: {assembly_graph_file}") + logger.info(f"Existing binning output file: {contig_bins_file}") + logger.info(f"Contig paths file: {contig_paths}") + logger.info(f"Final binning output file: {output_path}") + logger.info(f"Maximum number of iterations: {max_iteration}") + logger.info(f"Difference threshold: {diff_threshold}") logger.info("GraphBin started") # Get the number of bins from the initial binning result # -------------------------------------------------------- - n_bins = 0 - - try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=delimiter) - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info("Number of bins available in the binning result: " + str(n_bins)) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the binning result file is provided and it is having the correct format." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - logger.info("Constructing the assembly graph") - - # Get contig names - # ----------------------------------- - - contig_names = BidirectionalMap() - - contig_num = 0 - - with open(contig_paths, "r") as file: - - for line in file.readlines(): - - if not line.startswith("#"): - name = line.strip().split()[0] - contig_names[contig_num] = name - contig_num += 1 - - contig_names_rev = contig_names.inverse - - # Get the paths and edges - # ----------------------------------- - - paths = {} - segment_contigs = {} - - try: - - with open(contig_paths) as file: - - for line in file.readlines(): - - if not line.startswith("#"): - - strings = line.strip().split() - - contig_name = strings[0] - - path = strings[-1] - path = path.replace("*", "") - - if path.startswith(","): - path = path[1:] - - if path.endswith(","): - path = path[:-1] - - segments = path.rstrip().split(",") - - contig_num = contig_names_rev[contig_name] - - if contig_num not in paths: - paths[contig_num] = segments - - for segment in segments: - - if segment != "": - - if segment not in segment_contigs: - segment_contigs[segment] = set([contig_num]) - else: - segment_contigs[segment].add(contig_num) - - links_map = defaultdict(set) - - # Get links from assembly_graph.gfa - with open(assembly_graph_file) as file: - - for line in file.readlines(): - line = line.strip() - - # Identify lines with link information - if line.startswith("L"): - strings = line.split("\t") - - f1, f2 = "", "" - - if strings[2] == "+": - f1 = strings[1][5:] - if strings[2] == "-": - f1 = "-" + strings[1][5:] - if strings[4] == "+": - f2 = strings[3][5:] - if strings[4] == "-": - f2 = "-" + strings[3][5:] - - links_map[f1].add(f2) - links_map[f2].add(f1) - - # Create list of edges - edge_list = [] - - for i in paths: - segments = paths[i] - - new_links = [] - - for segment in segments: - - my_segment = segment - my_segment_num = "" + n_bins, bins_list = get_initial_bin_count(contig_bins_file, delimiter) - 
my_segment_rev = "" + # Get assembly graph + # -------------------- - if my_segment.startswith("-"): - my_segment_rev = my_segment[1:] - my_segment_num = my_segment[1:] - else: - my_segment_rev = "-" + my_segment - my_segment_num = my_segment - - if my_segment in links_map: - new_links.extend(list(links_map[my_segment])) - - if my_segment_rev in links_map: - new_links.extend(list(links_map[my_segment_rev])) - - if my_segment in segment_contigs: - for contig in segment_contigs[my_segment]: - if i != contig: - # Add edge to list of edges - edge_list.append((i, contig)) - - if my_segment_rev in segment_contigs: - for contig in segment_contigs[my_segment_rev]: - if i != contig: - # Add edge to list of edges - edge_list.append((i, contig)) - - if my_segment_num in segment_contigs: - for contig in segment_contigs[my_segment_num]: - if i != contig: - # Add edge to list of edges - edge_list.append((i, contig)) - - for new_link in new_links: - if new_link in segment_contigs: - for contig in segment_contigs[new_link]: - if i != contig: - # Add edge to list of edges - edge_list.append((i, contig)) - - if new_link.startswith("-"): - if new_link[1:] in segment_contigs: - for contig in segment_contigs[new_link[1:]]: - if i != contig: - # Add edge to list of edges - edge_list.append((i, contig)) - - node_count = len(contig_names_rev) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the assembly graph file is provided." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - logger.info("Total number of contigs available: " + str(node_count)) - - ## Construct the assembly graph - # ------------------------------- - - try: - - # Create the graph - assembly_graph = Graph() - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Name vertices - for i in range(len(assembly_graph.vs)): - assembly_graph.vs[i]["id"] = i - assembly_graph.vs[i]["label"] = str(contig_names[i]) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the assembly graph file is provided." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - logger.info("Total number of edges in the assembly graph: " + str(len(edge_list))) + assembly_graph, contig_names, node_count = parse_graph( + assembly_graph_file, contig_paths + ) # Get initial binning result # ---------------------------- - logger.info("Obtaining the initial binning result") - - bins = [[] for x in range(n_bins)] - - try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=delimiter) - for row in readCSV: - contig_num = contig_names_rev[row[0]] - - bin_num = bins_list.index(row[1]) - bins[bin_num].append(contig_num) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - # Remove labels of ambiguous vertices - # ------------------------------------- - - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - neighbours_have_same_label_list = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - # Further remove labels of ambiguous vertices - binned_contigs = [] - - for n in range(n_bins): - binned_contigs = sorted(binned_contigs + bins[n]) - - for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices( - assembly_graph, i, binned_contigs - ) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and 
i not in remove_labels: - if my_bin in remove_by_bin: - if ( - len(bins[my_bin]) - len(remove_by_bin[my_bin]) - >= MIN_BIN_COUNT - ): - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - logger.info("Obtaining refined binning result") - - # Get vertices which are not isolated and not in components without any labels - # ----------------------------------------------------------------------------- - - logger.info( - "Deteremining vertices which are not isolated and not in components without any labels" - ) - - non_isolated = [] - - for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length != len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - - logger.info("Number of non-isolated contigs: " + str(len(non_isolated))) - - # Run label propagation - # ----------------------- - - data = [] - - for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i + 1) - assigned = 
True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - - # Check if initial binning result consists of contigs belonging to multiple bins - - multiple_bins = False - - for item in data: - if type(item[1]) is int and type(item[2]) is int: - multiple_bins = True - break - - if multiple_bins: - logger.error( - "Initial binning result consists of contigs belonging to multiple bins. Please make sure that each contig in the initial binning result belongs to only one bin." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - # Label propagation - - lp = LabelProp() - - lp.load_data_from_mem(data) - - logger.info( - "Starting label propagation with eps=" - + str(diff_threshold) - + " and max_iteration=" - + str(max_iteration) + bins = get_initial_binning_result( + n_bins, bins_list, contig_bins_file, contig_names.inverse, delimiter ) - ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) - - logger.info("Obtaining Label Propagation result") - - for l in ans: - for i in range(n_bins): - if l[1] == i + 1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - # Remove labels of ambiguous vertices + # Run GraphBin logic # ------------------------------------- - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if 
len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) + final_bins, remove_labels, non_isolated = graphbin_main( + n_bins, + bins, + bins_list, + assembly_graph, + node_count, + diff_threshold, + max_iteration, + ) elapsed_time = time.time() - start_time - logger.info("Obtaining the Final Refined Binning result") - - final_bins = {} - - for i in range(n_bins): - for contig in bins[i]: - final_bins[contig] = bins_list[i] - # Print elapsed time for the process - logger.info("Elapsed time: " + str(elapsed_time) + " seconds") + logger.info(f"Elapsed time: {elapsed_time} seconds") # Write result to output file # ----------------------------- + write_output( + output_path, + prefix, + final_bins, + contigs_file, + contig_names.inverse, + bins, + contig_names, + bins_list, + delimiter, + node_count, + remove_labels, + non_isolated, + ) logger.info("Writing the Final Binning result to file") - output_bins = [] - - output_bins_path = output_path + prefix + "bins/" - output_file = output_path + prefix + "graphbin_output.csv" - - if not os.path.isdir(output_bins_path): - subprocess.run("mkdir -p " + output_bins_path, shell=True) - - bin_files = {} - - for bin_name in set(final_bins.values()): - bin_files[bin_name] = open( - output_bins_path + prefix + "bin_" + bin_name + ".fasta", "w+" - ) - - for label, seq in MinimalFastaParser(contigs_file): - - contig_num = contig_names_rev[label] - - if contig_num in final_bins: - bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n") - - # Close output files - for c in set(final_bins.values()): - bin_files[c].close() - - for b in range(len(bins)): - - # with 
open(output_bins_path + "bin_" + str(b+1) + "_ids.txt", "w") as bin_file: - for contig in bins[b]: - line = [] - line.append(str(contig_names[contig])) - line.append(bins_list[b]) - output_bins.append(line) - # bin_file.write(str(contig_names[contig])+"\n") - - # subprocess.run("awk -F'>' 'NR==FNR{ids[$0]; next} NF>1{f=($2 in ids)} f' " + output_bins_path + "bin_" + str(b+1) + "_ids.txt " + contigs_file + " > " + output_bins_path + "bin_" +str(b+1) +"_seqs.fasta", shell=True) - - with open(output_file, mode="w") as out_file: - output_writer = csv.writer( - out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - for row in output_bins: - output_writer.writerow(row) - - logger.info("Final binning results can be found in " + str(output_bins_path)) - - unbinned_contigs = [] - - for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append(str(contig_names[i])) - unbinned_contigs.append(line) - - if len(unbinned_contigs) != 0: - unbinned_file = output_path + prefix + "graphbin_unbinned.csv" - - with open(unbinned_file, mode="w") as out_file: - output_writer = csv.writer( - out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at " + unbinned_file) - - # Exit program - # -------------- - - logger.info("Thank you for using GraphBin! 
Bye...!") - - logger.removeHandler(fileHandler) - logger.removeHandler(consoleHeader) - -def main(): - # Setup argument parser - # ----------------------- - ap = PARSER - args = ap.parse_args() +def main(args): run(args) diff --git a/src/graphbin/utils/graphbin_Func.py b/src/graphbin/utils/graphbin_Func.py index 8d9fbf0..9b48d94 100644 --- a/src/graphbin/utils/graphbin_Func.py +++ b/src/graphbin/utils/graphbin_Func.py @@ -1,14 +1,24 @@ #!/usr/bin/env python3 +import logging +import sys + +from graphbin.utils.labelpropagation.labelprop import LabelProp + + __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] __license__ = "BSD-3" -__version__ = "1.6.0" +__version__ = "1.7.1" __maintainer__ = "Vijini Mallawaarachchi" __email__ = "vijini.mallawaarachchi@anu.edu.au" __status__ = "Production" +logger = logging.getLogger("GraphBin %s" % __version__) + +MIN_BIN_COUNT = 10 + def getClosestLabelledVertices(graph, node, binned_contigs): # Remove labels of ambiguous vertices @@ -42,3 +52,275 @@ def getClosestLabelledVertices(graph, node, binned_contigs): if len(temp2) > 0: queu_l.append(temp2) return labelled + + +def graphbin_main( + n_bins, bins, bins_list, assembly_graph, node_count, diff_threshold, max_iteration +): + logger.info("Determining ambiguous vertices") + + remove_by_bin = {} + + remove_labels = [] + + neighbours_have_same_label_list = [] + + for b in range(n_bins): + for i in bins[b]: + my_bin = b + + # Get set of closest labelled vertices with distance = 1 + closest_neighbours = assembly_graph.neighbors(i, mode="all") + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + neighbours_binned = False + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + neighbours_binned = True + if k != my_bin: + neighbours_have_same_label = False 
+ break + + if not neighbours_have_same_label: + if my_bin in remove_by_bin: + if len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: + remove_labels.append(i) + remove_by_bin[my_bin].append(i) + else: + if len(bins[my_bin]) >= MIN_BIN_COUNT: + remove_labels.append(i) + remove_by_bin[my_bin] = [i] + + elif neighbours_binned: + neighbours_have_same_label_list.append(i) + + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + # Further remove labels of ambiguous vertices + binned_contigs = [] + + for n in range(n_bins): + binned_contigs = sorted(binned_contigs + bins[n]) + + for b in range(n_bins): + for i in bins[b]: + if i not in neighbours_have_same_label_list: + my_bin = b + + # Get set of closest labelled vertices + closest_neighbours = getClosestLabelledVertices( + assembly_graph, i, binned_contigs + ) + + if len(closest_neighbours) > 0: + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label and i not in remove_labels: + if my_bin in remove_by_bin: + if ( + len(bins[my_bin]) - len(remove_by_bin[my_bin]) + >= MIN_BIN_COUNT + ): + remove_labels.append(i) + remove_by_bin[my_bin].append(i) + else: + if len(bins[my_bin]) >= MIN_BIN_COUNT: + remove_labels.append(i) + remove_by_bin[my_bin] = [i] + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + logger.info("Obtaining the refined binning result") + + # Get vertices which are not isolated and not in components without any labels + # ----------------------------------------------------------------------------- + + logger.info( + "Deteremining vertices which are not 
isolated and not in components without any labels" + ) + + non_isolated = [] + + for i in range(node_count): + if i not in non_isolated and i in binned_contigs: + component = [] + component.append(i) + length = len(component) + neighbours = assembly_graph.neighbors(i, mode="all") + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + component = list(set(component)) + + while length != len(component): + length = len(component) + + for j in component: + neighbours = assembly_graph.neighbors(j, mode="all") + + for neighbor in neighbours: + if neighbor not in component: + component.append(neighbor) + + labelled = False + for j in component: + if j in binned_contigs: + labelled = True + break + + if labelled: + for j in component: + if j not in non_isolated: + non_isolated.append(j) + + logger.info("Number of non-isolated contigs: " + str(len(non_isolated))) + + # Run label propagation + # ----------------------- + + data = [] + + for contig in range(node_count): + # Consider vertices that are not isolated + + if contig in non_isolated: + line = [] + line.append(contig) + + assigned = False + + for i in range(n_bins): + if contig in bins[i]: + line.append(i + 1) + assigned = True + + if not assigned: + line.append(0) + + neighbours = assembly_graph.neighbors(contig, mode="all") + + neighs = [] + + for neighbour in neighbours: + n = [] + n.append(neighbour) + n.append(1.0) + neighs.append(n) + + line.append(neighs) + + data.append(line) + + # Check if initial binning result consists of contigs belonging to multiple bins + + multiple_bins = False + + for item in data: + if type(item[1]) is int and type(item[2]) is int: + multiple_bins = True + break + + if multiple_bins: + logger.error( + "Initial binning result consists of contigs belonging to multiple bins. Please make sure that each contig in the initial binning result belongs to only one bin." + ) + logger.info("Exiting GraphBin... 
Bye...!") + sys.exit(1) + + # Label propagation + + lp = LabelProp() + + lp.load_data_from_mem(data) + + logger.info( + "Starting label propagation with eps=" + + str(diff_threshold) + + " and max_iteration=" + + str(max_iteration) + ) + + ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) + + logger.info("Obtaining Label Propagation result") + + for l in ans: + for i in range(n_bins): + if l[1] == i + 1 and l[0] not in bins[i]: + bins[i].append(l[0]) + + # Remove labels of ambiguous vertices + # ------------------------------------- + + logger.info("Determining ambiguous vertices") + + remove_by_bin = {} + + remove_labels = [] + + for b in range(n_bins): + for i in bins[b]: + my_bin = b + + closest_neighbours = assembly_graph.neighbors(i, mode="all") + + # Determine whether all the closest labelled vertices have the same label as its own + neighbours_have_same_label = True + + for neighbour in closest_neighbours: + for k in range(n_bins): + if neighbour in bins[k]: + if k != my_bin: + neighbours_have_same_label = False + break + + if not neighbours_have_same_label: + if my_bin in remove_by_bin: + if len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: + remove_labels.append(i) + remove_by_bin[my_bin].append(i) + else: + if len(bins[my_bin]) >= MIN_BIN_COUNT: + remove_labels.append(i) + remove_by_bin[my_bin] = [i] + + logger.info("Removing labels of ambiguous vertices") + + # Remove labels of ambiguous vertices + for i in remove_labels: + for n in range(n_bins): + if i in bins[n]: + bins[n].remove(i) + + logger.info("Obtaining the Final Refined Binning result") + + final_bins = {} + + for i in range(n_bins): + for contig in bins[i]: + final_bins[contig] = bins_list[i] + + return final_bins, remove_labels, non_isolated diff --git a/src/graphbin/utils/graphbin_MEGAHIT.py b/src/graphbin/utils/graphbin_MEGAHIT.py index f4afd9a..2fce7aa 100755 --- a/src/graphbin/utils/graphbin_MEGAHIT.py +++ 
b/src/graphbin/utils/graphbin_MEGAHIT.py @@ -11,44 +11,33 @@ graphbin_MEGAHIT.py makes use of the assembly graphs produced by MEGAHIT assembler. """ -import csv import logging -import os -import re -import subprocess -import sys import time -from cogent3.parse.fasta import MinimalFastaParser -from igraph import * +from graphbin.utils.graphbin_Func import graphbin_main +from graphbin.utils.parsers import get_initial_bin_count +from graphbin.utils.parsers.megahit_parser import ( + get_contig_descriptors, + get_initial_binning_result, + parse_graph, + write_output, +) -from graphbin.utils.bidirectionalmap.bidirectionalmap import BidirectionalMap -from graphbin.utils.graphbin_Func import getClosestLabelledVertices -from graphbin.utils.graphbin_Options import PARSER -from graphbin.utils.labelpropagation.labelprop import LabelProp __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] __license__ = "BSD-3" -__version__ = "1.6.0" +__version__ = "1.7.1" __maintainer__ = "Vijini Mallawaarachchi" __email__ = "vijini.mallawaarachchi@anu.edu.au" __status__ = "Production" +# create logger +logger = logging.getLogger("GraphBin %s" % __version__) -def run(args): - # Setup logger - # ----------------------- - - logger = logging.getLogger("GraphBin %s" % __version__) - logger.setLevel(logging.DEBUG) - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - consoleHeader = logging.StreamHandler() - consoleHeader.setFormatter(formatter) - consoleHeader.setLevel(logging.INFO) - logger.addHandler(consoleHeader) +def run(args): start_time = time.time() assembly_graph_file = args.graph @@ -61,14 +50,6 @@ def run(args): diff_threshold = args.diff_threshold MIN_BIN_COUNT = 10 - # Setup output path for log file - # --------------------------------------------------- - - fileHandler = logging.FileHandler(output_path + "/" + prefix + "graphbin.log") - 
fileHandler.setLevel(logging.DEBUG) - fileHandler.setFormatter(formatter) - logger.addHandler(fileHandler) - logger.info( "Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs." ) @@ -76,560 +57,81 @@ def run(args): "This version of GraphBin makes use of the assembly graph produced by MEGAHIT which is based on the de Bruijn graph approach." ) - logger.info("Assembly graph file: " + assembly_graph_file) - logger.info("Existing binning output file: " + contig_bins_file) - logger.info("Final binning output file: " + output_path) - logger.info("Maximum number of iterations: " + str(max_iteration)) - logger.info("Difference threshold: " + str(diff_threshold)) + logger.info(f"Assembly graph file: {assembly_graph_file}") + logger.info(f"Existing binning output file: {contig_bins_file}") + logger.info(f"Final binning output file: {output_path}") + logger.info(f"Maximum number of iterations: {max_iteration}") + logger.info(f"Difference threshold: {diff_threshold}") logger.info("GraphBin started") # Get the number of bins from the initial binning result # -------------------------------------------------------- - n_bins = 0 - - try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=delimiter) - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info( - "Number of bins available in the initial binning result: " + str(n_bins) - ) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the initial binning result file is provided and it is having the correct format." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - logger.info("Constructing the assembly graph") + n_bins, bins_list = get_initial_bin_count(contig_bins_file, delimiter) # Get original contig IDs # ------------------------------- - original_contigs = {} - contig_descriptions = {} - for label, seq in MinimalFastaParser(contigs_file): - name = label.split()[0] - original_contigs[name] = seq - contig_descriptions[name] = label - - # Construct the assembly graph - # ------------------------------- - - node_count = 0 - - graph_contigs = {} - - links = [] - - my_map = BidirectionalMap() - - try: - - # Get links from .gfa file - with open(assembly_graph_file) as file: - - for line in file.readlines(): - line = line.strip() - - # Identify lines with link information - if line.startswith("L"): - link = [] - - strings = line.split("\t") - - start_1 = "NODE_" - end_1 = "_length" - - link1 = int( - re.search("%s(.*)%s" % (start_1, end_1), strings[1]).group(1) - ) - - start_2 = "NODE_" - end_2 = "_length" - - link2 = int( - re.search("%s(.*)%s" % (start_2, end_2), strings[3]).group(1) - ) - - link.append(link1) - link.append(link2) - links.append(link) - - elif line.startswith("S"): - strings = line.split() - - start = "NODE_" - end = "_length" - - contig_num = int( - re.search("%s(.*)%s" % (start, end), strings[1]).group(1) - ) - - my_map[node_count] = int(contig_num) - - graph_contigs[contig_num] = strings[2] - - node_count += 1 - - logger.info("Total number of contigs available: " + str(node_count)) - - contigs_map = my_map - contigs_map_rev = my_map.inverse - - # Create graph - assembly_graph = Graph() - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Create list of edges - edge_list = [] - - for i in range(node_count): - assembly_graph.vs[i]["id"] = i - assembly_graph.vs[i]["label"] = str(contigs_map[i]) - - # Iterate links - for link in links: - # Remove self loops - if link[0] != link[1]: - # Add edge to list of edges - edge_list.append((contigs_map_rev[link[0]], 
contigs_map_rev[link[1]])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) + original_contigs = get_contig_descriptors(contigs_file) - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the assembly graph file is provided." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) + # Get assembly graph + # -------------------- - logger.info("Total number of edges in the assembly graph: " + str(len(edge_list))) - - # Map original contig IDs to contig IDS of assembly graph - # -------------------------------------------------------- - - graph_to_contig_map = BidirectionalMap() - - for (n, m), (n2, m2) in zip(graph_contigs.items(), original_contigs.items()): - if m == m2: - graph_to_contig_map[n] = n2 - - graph_to_contig_map_rev = graph_to_contig_map.inverse + assembly_graph, graph_to_contig_map, contigs_map, node_count = parse_graph( + assembly_graph_file, original_contigs + ) # Get initial binning result # ---------------------------- - logger.info("Obtaining the initial binning result") - - bins = [[] for x in range(n_bins)] - - try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=delimiter) - for row in readCSV: - contig_num = contigs_map_rev[int(graph_to_contig_map_rev[row[0]])] - - bin_num = bins_list.index(row[1]) - bins[bin_num].append(contig_num) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - # Remove labels of ambiguous vertices - # ------------------------------------- - - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - neighbours_have_same_label_list = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - # Further remove labels of ambiguous vertices - binned_contigs = [] - - for n in range(n_bins): - binned_contigs = sorted(binned_contigs + bins[n]) - - for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices( - assembly_graph, i, binned_contigs - ) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and 
i not in remove_labels: - if my_bin in remove_by_bin: - if ( - len(bins[my_bin]) - len(remove_by_bin[my_bin]) - >= MIN_BIN_COUNT - ): - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - logger.info("Obtaining refined binning result") - - # Get vertices which are not isolated and not in components without any labels - # ----------------------------------------------------------------------------- - - logger.info( - "Deteremining vertices which are not isolated and not in components without any labels" + bins = get_initial_binning_result( + n_bins, + bins_list, + contig_bins_file, + contigs_map.inverse, + graph_to_contig_map.inverse, + delimiter, ) - non_isolated = [] - - for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length != len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - - logger.info("Number of non-isolated contigs: " + str(len(non_isolated))) - - # Run label propagation - # ----------------------- - - data = [] - - for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in 
non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i + 1) - assigned = True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - - # Check if initial binning result consists of contigs belonging to multiple bins - - multiple_bins = False - - for item in data: - if type(item[1]) is int and type(item[2]) is int: - multiple_bins = True - break - - if multiple_bins: - logger.error( - "Initial binning result consists of contigs belonging to multiple bins. Please make sure that each contig in the initial binning result belongs to only one bin." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - # Label propagation - - lp = LabelProp() - - lp.load_data_from_mem(data) - - logger.info( - "Starting label propagation with eps=" - + str(diff_threshold) - + " and max_iteration=" - + str(max_iteration) - ) - - ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) - - logger.info("Obtaining Label Propagation result") - - for l in ans: - for i in range(n_bins): - if l[1] == i + 1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - # Remove labels of ambiguous vertices + # Run GraphBin logic # ------------------------------------- - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not 
neighbours_have_same_label: - if my_bin in remove_by_bin: - if len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) + final_bins, remove_labels, non_isolated = graphbin_main( + n_bins, + bins, + bins_list, + assembly_graph, + node_count, + diff_threshold, + max_iteration, + ) elapsed_time = time.time() - start_time - logger.info("Obtaining the Final Refined Binning result") - - final_bins = {} - - for i in range(n_bins): - for contig in bins[i]: - final_bins[contig] = bins_list[i] - # Print elapsed time for the process - logger.info("Elapsed time: " + str(elapsed_time) + " seconds") + logger.info(f"Elapsed time: {elapsed_time} seconds") # Write result to output file # ----------------------------- - logger.info("Writing the Final Binning result to file") - - output_bins = [] - - output_bins_path = output_path + prefix + "bins/" - output_file = output_path + prefix + "graphbin_output.csv" - - if not os.path.isdir(output_bins_path): - subprocess.run("mkdir -p " + output_bins_path, shell=True) - - bin_files = {} - - for bin_name in set(final_bins.values()): - bin_files[bin_name] = open( - output_bins_path + prefix + "bin_" + bin_name + ".fasta", "w+" - ) - - for label, seq in MinimalFastaParser( - contigs_file, label_to_name=lambda x: x.split()[0] - ): - - contig_num = contigs_map_rev[graph_to_contig_map_rev[label]] - - if contig_num in final_bins: - bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n") - - # Close output files - for c in set(final_bins.values()): - bin_files[c].close() - - for b in range(len(bins)): - - for contig in bins[b]: - line = [] - 
line.append(graph_to_contig_map[contigs_map[contig]]) - line.append(bins_list[b]) - output_bins.append(line) - - with open(output_file, mode="w") as out_file: - output_writer = csv.writer( - out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - for row in output_bins: - output_writer.writerow(row) - - logger.info("Final binning results can be found in " + str(output_bins_path)) - - unbinned_contigs = [] - - for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append(graph_to_contig_map[contigs_map[i]]) - unbinned_contigs.append(line) - - if len(unbinned_contigs) != 0: - unbinned_file = output_path + prefix + "graphbin_unbinned.csv" - - with open(unbinned_file, mode="w") as out_file: - output_writer = csv.writer( - out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at " + unbinned_file) - - # Exit program - # -------------- - - logger.info("Thank you for using GraphBin! Bye...!") - - logger.removeHandler(fileHandler) - logger.removeHandler(consoleHeader) + write_output( + output_path, + prefix, + final_bins, + contigs_file, + graph_to_contig_map, + bins, + contigs_map, + bins_list, + delimiter, + node_count, + remove_labels, + non_isolated, + ) -def main(): - # Setup argument parser - # ----------------------- - ap = PARSER - args = ap.parse_args() +def main(args): run(args) diff --git a/src/graphbin/utils/graphbin_Miniasm.py b/src/graphbin/utils/graphbin_Miniasm.py index 32510b8..1302b2a 100755 --- a/src/graphbin/utils/graphbin_Miniasm.py +++ b/src/graphbin/utils/graphbin_Miniasm.py @@ -11,43 +11,32 @@ graphbin_Miniasm.py makes use of the assembly graphs produced by Miniasm long read assembler. 
""" -import csv import logging -import os -import subprocess -import sys import time -from cogent3.parse.fasta import MinimalFastaParser -from igraph import * +from graphbin.utils.graphbin_Func import graphbin_main +from graphbin.utils.parsers import get_initial_bin_count +from graphbin.utils.parsers.miniasm_parser import ( + get_initial_binning_result, + parse_graph, + write_output, +) -from graphbin.utils.bidirectionalmap.bidirectionalmap import BidirectionalMap -from graphbin.utils.graphbin_Func import getClosestLabelledVertices -from graphbin.utils.graphbin_Options import PARSER -from graphbin.utils.labelpropagation.labelprop import LabelProp __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] __license__ = "BSD-3" -__version__ = "1.6.0" +__version__ = "1.7.1" __maintainer__ = "Vijini Mallawaarachchi" __email__ = "vijini.mallawaarachchi@anu.edu.au" __status__ = "Production" +# create logger +logger = logging.getLogger("GraphBin %s" % __version__) -def run(args): - # Setup logger - # ----------------------- - - logger = logging.getLogger("GraphBin %s" % __version__) - logger.setLevel(logging.DEBUG) - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - consoleHeader = logging.StreamHandler() - consoleHeader.setFormatter(formatter) - consoleHeader.setLevel(logging.INFO) - logger.addHandler(consoleHeader) +def run(args): start_time = time.time() assembly_graph_file = args.graph @@ -60,14 +49,6 @@ def run(args): diff_threshold = args.diff_threshold MIN_BIN_COUNT = 10 - # Setup output path for log file - # --------------------------------------------------- - - fileHandler = logging.FileHandler(output_path + "/" + prefix + "graphbin.log") - fileHandler.setLevel(logging.INFO) - fileHandler.setFormatter(formatter) - logger.addHandler(fileHandler) - logger.info( "Welcome to GraphBin: Refined Binning of Metagenomic Contigs 
using Assembly Graphs." ) @@ -75,529 +56,68 @@ def run(args): "This version of GraphBin makes use of the assembly graph produced by Miniasm." ) - logger.info("Assembly graph file: " + assembly_graph_file) - logger.info("Existing binning output file: " + contig_bins_file) - logger.info("Final binning output file: " + output_path) - logger.info("Maximum number of iterations: " + str(max_iteration)) - logger.info("Difference threshold: " + str(diff_threshold)) + logger.info(f"Assembly graph file: {assembly_graph_file}") + logger.info(f"Existing binning output file: {contig_bins_file}") + logger.info(f"Final binning output file: {output_path}") + logger.info(f"Maximum number of iterations: {max_iteration}") + logger.info(f"Difference threshold: {diff_threshold}") logger.info("GraphBin started") # Get the number of bins from the initial binning result # -------------------------------------------------------- - n_bins = 0 - - try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=delimiter) - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info("Number of bins available in the binning result: " + str(n_bins)) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the binning result file is provided and it is having the correct format." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - logger.info("Constructing the assembly graph") - - # Get the links from the .gfa file - # ----------------------------------- - - my_map = BidirectionalMap() - - node_count = 0 - - nodes = [] - - links = [] - - try: - # Get contig connections from .gfa file - with open(assembly_graph_file) as file: - - for line in file.readlines(): - line = line.strip() - - # Count the number of contigs - if line.startswith("S"): - strings = line.split("\t") - my_node = strings[1] - my_map[node_count] = my_node - nodes.append(my_node) - node_count += 1 + n_bins, bins_list = get_initial_bin_count(contig_bins_file, delimiter) - # Identify lines with link information - elif line.startswith("L"): + # Get assembly graph + # -------------------- - link = [] - strings = line.split("\t") - - if strings[1] != strings[3]: - start = strings[1] - end = strings[3] - link.append(start) - link.append(end) - links.append(link) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the assembly graph file is provided." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - contigs_map = my_map - contigs_map_rev = my_map.inverse - - logger.info("Total number of contigs available: " + str(node_count)) - - ## Construct the assembly graph - # ------------------------------- - - try: - - # Create the graph - assembly_graph = Graph() - - # Create list of edges - edge_list = [] - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Name vertices - for i in range(len(assembly_graph.vs)): - assembly_graph.vs[i]["id"] = i - assembly_graph.vs[i]["label"] = str(contigs_map[i]) - - # Iterate links - for link in links: - # Remove self loops - if link[0] != link[1]: - # Add edge to list of edges - edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the assembly graph file is provided." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - logger.info("Total number of edges in the assembly graph: " + str(len(edge_list))) + assembly_graph, contigs_map, node_count = parse_graph(assembly_graph_file) # Get initial binning result # ---------------------------- - logger.info("Obtaining the initial binning result") - - bins = [[] for x in range(n_bins)] - - try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=delimiter) - for row in readCSV: - contig_num = contigs_map_rev[row[0]] - - bin_num = bins_list.index(row[1]) - bins[bin_num].append(contig_num) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - # Remove labels of ambiguous vertices - # ------------------------------------- - - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - neighbours_have_same_label_list = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - # Further remove labels of ambiguous vertices - binned_contigs = [] - - for n in range(n_bins): - binned_contigs = sorted(binned_contigs + bins[n]) - - for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices( - assembly_graph, i, binned_contigs - ) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and 
i not in remove_labels: - if my_bin in remove_by_bin: - if ( - len(bins[my_bin]) - len(remove_by_bin[my_bin]) - >= MIN_BIN_COUNT - ): - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - logger.info("Obtaining refined binning result") - - # Get vertices which are not isolated and not in components without any labels - # ----------------------------------------------------------------------------- - - logger.info( - "Deteremining vertices which are not isolated and not in components without any labels" - ) - - non_isolated = [] - - for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length != len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - - logger.info("Number of non-isolated contigs: " + str(len(non_isolated))) - - # Run label propagation - # ----------------------- - - data = [] - - for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i + 1) - assigned = 
True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - - # Check if initial binning result consists of contigs belonging to multiple bins - - multiple_bins = False - - for item in data: - if type(item[1]) is int and type(item[2]) is int: - multiple_bins = True - break - - if multiple_bins: - logger.error( - "Initial binning result consists of contigs belonging to multiple bins. Please make sure that each contig in the initial binning result belongs to only one bin." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - # Label propagation - - lp = LabelProp() - - lp.load_data_from_mem(data) - - logger.info( - "Starting label propagation with eps=" - + str(diff_threshold) - + " and max_iteration=" - + str(max_iteration) + bins = get_initial_binning_result( + n_bins, bins_list, contig_bins_file, contigs_map.inverse, delimiter ) - ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) - - logger.info("Obtaining Label Propagation result") - - for l in ans: - for i in range(n_bins): - if l[1] == i + 1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - # Remove labels of ambiguous vertices + # Run GraphBin logic # ------------------------------------- - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if 
len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) + final_bins, remove_labels, non_isolated = graphbin_main( + n_bins, + bins, + bins_list, + assembly_graph, + node_count, + diff_threshold, + max_iteration, + ) elapsed_time = time.time() - start_time - logger.info("Obtaining the Final Refined Binning result") - - final_bins = {} - - for i in range(n_bins): - for contig in bins[i]: - final_bins[contig] = bins_list[i] - # Print elapsed time for the process - logger.info("Elapsed time: " + str(elapsed_time) + " seconds") + logger.info(f"Elapsed time: {elapsed_time} seconds") # Write result to output file # ----------------------------- - logger.info("Writing the Final Binning result to file") - - output_bins = [] - - output_bins_path = output_path + prefix + "bins/" - output_file = output_path + prefix + "graphbin_output.csv" - - if not os.path.isdir(output_bins_path): - subprocess.run("mkdir -p " + output_bins_path, shell=True) - - bin_files = {} - - for bin_name in set(final_bins.values()): - bin_files[bin_name] = open( - output_bins_path + prefix + "bin_" + bin_name + ".fasta", "w+" - ) - - for label, seq in MinimalFastaParser(contigs_file): - - contig_num = contigs_map_rev[label] - - if contig_num in final_bins: - bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n") - - # Close output files - for c in set(final_bins.values()): - bin_files[c].close() - - for b in range(len(bins)): - - for contig in bins[b]: - line = [] - line.append(str(contigs_map[contig])) - line.append(bins_list[b]) - output_bins.append(line) - - with open(output_file, mode="w") as out_file: - output_writer = csv.writer( - 
out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - for row in output_bins: - output_writer.writerow(row) - - logger.info("Final binning results can be found in " + str(output_bins_path)) - - unbinned_contigs = [] - - for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append(str(contigs_map[i])) - unbinned_contigs.append(line) - - if len(unbinned_contigs) != 0: - unbinned_file = output_path + prefix + "graphbin_unbinned.csv" - - with open(unbinned_file, mode="w") as out_file: - output_writer = csv.writer( - out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at " + unbinned_file) - - # Exit program - # -------------- - - logger.info("Thank you for using GraphBin! Bye...!") - - logger.removeHandler(fileHandler) - logger.removeHandler(consoleHeader) + write_output( + output_path, + prefix, + final_bins, + contigs_file, + contigs_map, + bins, + bins_list, + delimiter, + node_count, + remove_labels, + non_isolated, + ) -def main(): - # Setup argument parser - # ----------------------- - ap = PARSER - args = ap.parse_args() +def main(args): run(args) diff --git a/src/graphbin/utils/graphbin_Options.py b/src/graphbin/utils/graphbin_Options.py deleted file mode 100644 index d493566..0000000 --- a/src/graphbin/utils/graphbin_Options.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 - -import argparse - -__author__ = "Vijini Mallawaarachchi" -__copyright__ = "Copyright 2019-2022, GraphBin Project" -__credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] -__license__ = "BSD-3" -__version__ = "1.6.0" -__maintainer__ = "Vijini Mallawaarachchi" -__email__ = "vijini.mallawaarachchi@anu.edu.au" -__status__ = "Production" - - -PARSER = argparse.ArgumentParser( - description="""GraphBin Help. 
GraphBin is a metagenomic contig binning tool - that makes use of the contig connectivity information from the assembly graph to bin contigs. It utilizes the - binning result of an existing binning tool and a label propagation algorithm to correct mis-binned contigs - and predict the labels of contigs which are discarded due to short length.""", - prog="graphbin", -) - -PARSER.add_argument("--version", default=False, action="store_true") - -PARSER.add_argument("--graph", type=str, help="path to the assembly graph file") - -PARSER.add_argument( - "--binned", - type=str, - help="path to the .csv file with the initial binning output from an existing tool", -) - -PARSER.add_argument("--output", type=str, help="path to the output folder") - -PARSER.add_argument("--prefix", type=str, default="", help="prefix for the output file") - -PARSER.add_argument( - "--max_iteration", - type=int, - default=100, - help="maximum number of iterations for label propagation algorithm. [default: 100]", -) - -PARSER.add_argument( - "--diff_threshold", - type=float, - default=0.1, - help="difference threshold for label propagation algorithm. [default: 0.1]", -) diff --git a/src/graphbin/utils/graphbin_SGA.py b/src/graphbin/utils/graphbin_SGA.py index bcb51df..b43608e 100755 --- a/src/graphbin/utils/graphbin_SGA.py +++ b/src/graphbin/utils/graphbin_SGA.py @@ -11,44 +11,33 @@ graphbin_SGA.py makes use of the assembly graphs produced by SGA (String Graph Assembler). 
""" -import csv import logging -import os -import re -import subprocess -import sys import time -from cogent3.parse.fasta import MinimalFastaParser -from igraph import * +from graphbin.utils.graphbin_Func import graphbin_main +from graphbin.utils.parsers import get_initial_bin_count +from graphbin.utils.parsers.sga_parser import ( + get_contig_descriptions, + get_initial_binning_result, + parse_graph, + write_output, +) -from graphbin.utils.bidirectionalmap.bidirectionalmap import BidirectionalMap -from graphbin.utils.graphbin_Func import getClosestLabelledVertices -from graphbin.utils.graphbin_Options import PARSER -from graphbin.utils.labelpropagation.labelprop import LabelProp __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] __license__ = "BSD-3" -__version__ = "1.6.0" +__version__ = "1.7.1" __maintainer__ = "Vijini Mallawaarachchi" __email__ = "vijini.mallawaarachchi@anu.edu.au" __status__ = "Production" +# create logger +logger = logging.getLogger("GraphBin %s" % __version__) -def run(args): - # Setup logger - # ----------------------- - - logger = logging.getLogger("GraphBin %s" % __version__) - logger.setLevel(logging.DEBUG) - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - consoleHeader = logging.StreamHandler() - consoleHeader.setFormatter(formatter) - consoleHeader.setLevel(logging.INFO) - logger.addHandler(consoleHeader) +def run(args): start_time = time.time() assembly_graph_file = args.graph @@ -59,15 +48,6 @@ def run(args): delimiter = args.delimiter max_iteration = args.max_iteration diff_threshold = args.diff_threshold - MIN_BIN_COUNT = 10 - - # Setup output path for log file - # --------------------------------------------------- - - fileHandler = logging.FileHandler(output_path + "/" + prefix + "graphbin.log") - fileHandler.setLevel(logging.DEBUG) - fileHandler.setFormatter(formatter) - 
logger.addHandler(fileHandler) logger.info( "Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs." @@ -76,542 +56,74 @@ def run(args): "This version of GraphBin makes use of the assembly graph produced by SGA which is based on the OLC (more recent string graph) approach." ) - logger.info("Assembly graph file: " + assembly_graph_file) - logger.info("Existing binning output file: " + contig_bins_file) - logger.info("Final binning output file: " + output_path) - logger.info("Maximum number of iterations: " + str(max_iteration)) - logger.info("Difference threshold: " + str(diff_threshold)) + logger.info(f"Assembly graph file: {assembly_graph_file}") + logger.info(f"Existing binning output file: {contig_bins_file}") + logger.info(f"Final binning output file: {output_path}") + logger.info(f"Maximum number of iterations: {max_iteration}") + logger.info(f"Difference threshold: {diff_threshold}") logger.info("GraphBin started") # Get the number of bins from the initial binning result # -------------------------------------------------------- - n_bins = 0 - - try: - all_bins_list = [] - - with open(contig_bins_file) as csvfile: - readCSV = csv.reader(csvfile, delimiter=delimiter) - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info("Number of bins available in the binning result: " + str(n_bins)) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the binning result file is provided and it is having the correct format." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - logger.info("Constructing the assembly graph") - - contig_descriptions = {} - - for label, _ in MinimalFastaParser(contigs_file): - name = label.split()[0] - contig_descriptions[name] = label - - # Get the links from the .asqg file - # ----------------------------------- - - links = [] - - contig_names = BidirectionalMap() - - my_map = BidirectionalMap() - - node_count = 0 - - try: - # Get contig connections from .asqg file - with open(assembly_graph_file) as file: + n_bins, bins_list = get_initial_bin_count(contig_bins_file, delimiter) - for line in file.readlines(): - line = line.strip() + # Get assembly graph + # -------------------- - # Count the number of contigs - if line.startswith("VT"): - start = "contig-" - end = "" - contig_name = str(line.split()[1]) - contig_num = int( - re.search("%s(.*)%s" % (start, end), contig_name).group(1) - ) - my_map[node_count] = contig_num - contig_names[node_count] = contig_name.strip() - node_count += 1 - - # Identify lines with link information - elif line.startswith("ED"): - link = [] - strings = line.split("\t")[1].split() - link.append(int(strings[0][7:])) - link.append(int(strings[1][7:])) - links.append(link) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the assembly graph file is provided." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - contigs_map = my_map - contigs_map_rev = my_map.inverse - - contig_names_rev = contig_names.inverse - - logger.info("Total number of contigs available: " + str(node_count)) - - ## Construct the assembly graph - # ------------------------------- - - try: - - # Create the graph - assembly_graph = Graph() - - # Create list of edges - edge_list = [] - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Name vertices - for i in range(len(assembly_graph.vs)): - assembly_graph.vs[i]["id"] = i - assembly_graph.vs[i]["label"] = str(contigs_map[i]) - - # Iterate links - for link in links: - # Remove self loops - if link[0] != link[1]: - # Add edge to list of edges - edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the assembly graph file is provided." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - logger.info("Total number of edges in the assembly graph: " + str(len(edge_list))) + assembly_graph, contigs_map, contig_names, node_count = parse_graph( + assembly_graph_file + ) # Get initial binning result # ---------------------------- - logger.info("Obtaining the initial binning result") - - bins = [[] for x in range(n_bins)] - - try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=delimiter) - for row in readCSV: - start = "contig-" - end = "" - contig_num = contigs_map_rev[ - int(re.search("%s(.*)%s" % (start, end), row[0]).group(1)) - ] - - bin_num = bins_list.index(row[1]) - bins[bin_num].append(contig_num) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - # Remove labels of ambiguous vertices - # ------------------------------------- - - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - neighbours_have_same_label_list = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - 
remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - # Further remove labels of ambiguous vertices - binned_contigs = [] - - for n in range(n_bins): - binned_contigs = sorted(binned_contigs + bins[n]) - - for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices( - assembly_graph, i, binned_contigs - ) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and i not in remove_labels: - if my_bin in remove_by_bin: - if ( - len(bins[my_bin]) - len(remove_by_bin[my_bin]) - >= MIN_BIN_COUNT - ): - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - logger.info("Obtaining refined binning result") - - # Get vertices which are not isolated and not in components without any labels - # ----------------------------------------------------------------------------- - - logger.info( - "Deteremining vertices which are not isolated and not in components without any labels" + bins = get_initial_binning_result( + n_bins, bins_list, contig_bins_file, contigs_map.inverse, delimiter ) - non_isolated = [] - - for i in range(node_count): - - if i not in non_isolated and i in 
binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) + contig_descriptions = get_contig_descriptions(contigs_file) - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length != len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - - logger.info("Number of non-isolated contigs: " + str(len(non_isolated))) - - # Run label propagation - # ----------------------- - - data = [] - - for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) - - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i + 1) - assigned = True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - - # Check if initial binning result consists of contigs belonging to multiple bins - - multiple_bins = False - - for item in data: - if type(item[1]) is int and type(item[2]) is int: - multiple_bins = True - break - - if multiple_bins: - logger.error( - "Initial binning result consists of contigs belonging to multiple bins. Please make sure that each contig in the initial binning result belongs to only one bin." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - # Label propagation - - lp = LabelProp() - - lp.load_data_from_mem(data) - - logger.info( - "Starting label propagation with eps=" - + str(diff_threshold) - + " and max_iteration=" - + str(max_iteration) - ) - - ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) - - logger.info("Obtaining Label Propagation result") - - for l in ans: - for i in range(n_bins): - if l[1] == i + 1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - # Remove labels of ambiguous vertices + # Run GraphBin logic # ------------------------------------- - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) + final_bins, remove_labels, non_isolated = graphbin_main( + n_bins, + bins, + bins_list, + assembly_graph, + node_count, + diff_threshold, + max_iteration, + ) elapsed_time = time.time() - start_time - logger.info("Obtaining the Final Refined Binning result") - - final_bins = {} - - for i in range(n_bins): - for contig in bins[i]: - final_bins[contig] = bins_list[i] - # Print elapsed time for the process - 
logger.info("Elapsed time: " + str(elapsed_time) + " seconds") + logger.info(f"Elapsed time: {elapsed_time} seconds") # Write result to output file # ----------------------------- - logger.info("Writing the Final Binning result to file") - - output_bins = [] - - output_bins_path = output_path + prefix + "bins/" - output_file = output_path + prefix + "graphbin_output.csv" - - if not os.path.isdir(output_bins_path): - subprocess.run("mkdir -p " + output_bins_path, shell=True) - - bin_files = {} - - for bin_name in set(final_bins.values()): - bin_files[bin_name] = open( - output_bins_path + prefix + "bin_" + bin_name + ".fasta", "w+" - ) - - for label, seq in MinimalFastaParser( - contigs_file, label_to_name=lambda x: x.split()[0] - ): - - contig_num = contig_names_rev[label] - - if contig_num in final_bins: - bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n") - - # Close output files - for c in set(final_bins.values()): - bin_files[c].close() - - for b in range(len(bins)): - - for contig in bins[b]: - line = [] - line.append(contig_descriptions[contig_names[contig]]) - line.append(bins_list[b]) - output_bins.append(line) - - with open(output_file, mode="w") as out_file: - output_writer = csv.writer( - out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - for row in output_bins: - output_writer.writerow(row) - - logger.info("Final binning results can be found in " + str(output_bins_path)) - - unbinned_contigs = [] - - for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append(contig_names[i]) - unbinned_contigs.append(line) - - if len(unbinned_contigs) != 0: - unbinned_file = output_path + prefix + "graphbin_unbinned.csv" - - with open(unbinned_file, mode="w") as out_file: - output_writer = csv.writer( - out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at 
" + unbinned_file) - - # Exit program - # -------------- - - logger.info("Thank you for using GraphBin! Bye...!") - - logger.removeHandler(fileHandler) - logger.removeHandler(consoleHeader) + write_output( + output_path, + prefix, + final_bins, + contigs_file, + contig_names.inverse, + bins, + contig_names, + bins_list, + delimiter, + node_count, + remove_labels, + non_isolated, + contig_descriptions, + ) -def main(): - # Setup argument parser - # ----------------------- - ap = PARSER - args = ap.parse_args() +def main(args): run(args) diff --git a/src/graphbin/utils/graphbin_SPAdes.py b/src/graphbin/utils/graphbin_SPAdes.py index 7a90f1c..a374cc9 100755 --- a/src/graphbin/utils/graphbin_SPAdes.py +++ b/src/graphbin/utils/graphbin_SPAdes.py @@ -11,44 +11,32 @@ graphbin_SPAdes.py makes use of the assembly graphs produced by SPAdes. """ -import csv import logging -import os -import re -import subprocess -import sys import time -from collections import defaultdict -from cogent3.parse.fasta import MinimalFastaParser -from igraph import * +from graphbin.utils.graphbin_Func import graphbin_main +from graphbin.utils.parsers import get_initial_bin_count +from graphbin.utils.parsers.spades_parser import ( + get_initial_binning_result, + parse_graph, + write_output, +) -from graphbin.utils.bidirectionalmap.bidirectionalmap import BidirectionalMap -from graphbin.utils.graphbin_Func import getClosestLabelledVertices -from graphbin.utils.graphbin_Options import PARSER -from graphbin.utils.labelpropagation.labelprop import LabelProp __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] __license__ = "BSD-3" -__version__ = "1.6.0" +__version__ = "1.7.1" __maintainer__ = "Vijini Mallawaarachchi" __email__ = "vijini.mallawaarachchi@anu.edu.au" __status__ = "Production" +# create logger +logger = logging.getLogger("GraphBin %s" % __version__) -def run(args): - # 
Setup logger - # ----------------------- - logger = logging.getLogger("GraphBin %s" % __version__) - logger.setLevel(logging.DEBUG) - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - consoleHeader = logging.StreamHandler() - consoleHeader.setFormatter(formatter) - consoleHeader.setLevel(logging.INFO) - logger.addHandler(consoleHeader) +def run(args): start_time = time.time() assembly_graph_file = args.graph @@ -60,15 +48,6 @@ def run(args): delimiter = args.delimiter max_iteration = args.max_iteration diff_threshold = args.diff_threshold - MIN_BIN_COUNT = 10 - - # Setup output path for log file - # --------------------------------------------------- - - fileHandler = logging.FileHandler(output_path + "/" + prefix + "graphbin.log") - fileHandler.setLevel(logging.DEBUG) - fileHandler.setFormatter(formatter) - logger.addHandler(fileHandler) logger.info( "Welcome to GraphBin: Refined Binning of Metagenomic Contigs using Assembly Graphs." @@ -78,597 +57,72 @@ def run(args): ) logger.info("Input arguments:") - logger.info("Assembly graph file: " + assembly_graph_file) - logger.info("Contig paths file: " + contig_paths) - logger.info("Existing binning output file: " + contig_bins_file) - logger.info("Final binning output file: " + output_path) - logger.info("Maximum number of iterations: " + str(max_iteration)) - logger.info("Difference threshold: " + str(diff_threshold)) + logger.info(f"Assembly graph file: {assembly_graph_file}") + logger.info(f"Contig paths file: {contig_paths}") + logger.info(f"Existing binning output file: {contig_bins_file}") + logger.info(f"Final binning output file: {output_path}") + logger.info(f"Maximum number of iterations: {max_iteration}") + logger.info(f"Difference threshold: {diff_threshold}") logger.info("GraphBin started") # Get the number of bins from the initial binning result # --------------------------------------------------- - n_bins = 0 - - try: - all_bins_list = [] - - with open(contig_bins_file) as 
csvfile: - readCSV = csv.reader(csvfile, delimiter=delimiter) - for row in readCSV: - all_bins_list.append(row[1]) - - bins_list = list(set(all_bins_list)) - bins_list.sort() - - n_bins = len(bins_list) - logger.info( - "Number of bins available in the initial binning result: " + str(n_bins) - ) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the initial binning result file is provided and it is having the correct format." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - logger.info("Constructing the assembly graph") - - # Get contig paths from contigs.paths - # ------------------------------------- - - paths = {} - segment_contigs = {} - node_count = 0 - - contig_names = BidirectionalMap() - - my_map = BidirectionalMap() - - current_contig_num = "" - - try: - with open(contig_paths) as file: - name = file.readline() - path = file.readline() - - while name != "" and path != "": - - while ";" in path: - path = path[:-2] + "," + file.readline() - - start = "NODE_" - end = "_length_" - contig_num = str( - int(re.search("%s(.*)%s" % (start, end), name).group(1)) - ) - - segments = path.rstrip().split(",") - - if current_contig_num != contig_num: - my_map[node_count] = int(contig_num) - current_contig_num = contig_num - contig_names[node_count] = name.strip() - node_count += 1 - - if contig_num not in paths: - paths[contig_num] = [segments[0], segments[-1]] - - for segment in segments: - if segment not in segment_contigs: - segment_contigs[segment] = set([contig_num]) - else: - segment_contigs[segment].add(contig_num) - - name = file.readline() - path = file.readline() - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the contig paths file is provided." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - contigs_map = my_map - contigs_map_rev = my_map.inverse - - contig_names_rev = contig_names.inverse - - logger.info("Total number of contigs available: " + str(node_count)) - - links = [] - links_map = defaultdict(set) - - ## Construct the assembly graph - # ------------------------------- - - try: - # Get links from assembly_graph_with_scaffolds.gfa - with open(assembly_graph_file) as file: - - for line in file.readlines(): - line = line.strip() - - # Identify lines with link information - if line.startswith("L"): - strings = line.split("\t") - f1, f2 = strings[1] + strings[2], strings[3] + strings[4] - links_map[f1].add(f2) - links_map[f2].add(f1) - links.append( - strings[1] + strings[2] + " " + strings[3] + strings[4] - ) - - # Create graph - assembly_graph = Graph() - - # Add vertices - assembly_graph.add_vertices(node_count) - - # Create list of edges - edge_list = [] - - # Name vertices - for i in range(node_count): - assembly_graph.vs[i]["id"] = i - assembly_graph.vs[i]["label"] = str(contigs_map[i]) - - for i in range(len(paths)): - segments = paths[str(contigs_map[i])] - - start = segments[0] - start_rev = "" - - if start.endswith("+"): - start_rev = start[:-1] + "-" - else: - start_rev = start[:-1] + "+" + n_bins, bins_list = get_initial_bin_count(contig_bins_file, delimiter) - end = segments[1] - end_rev = "" + # Get assembly graph + # -------------------- - if end.endswith("+"): - end_rev = end[:-1] + "-" - else: - end_rev = end[:-1] + "+" - - new_links = [] - - if start in links_map: - new_links.extend(list(links_map[start])) - if start_rev in links_map: - new_links.extend(list(links_map[start_rev])) - if end in links_map: - new_links.extend(list(links_map[end])) - if end_rev in links_map: - new_links.extend(list(links_map[end_rev])) - - for new_link in new_links: - if new_link in segment_contigs: - for contig in segment_contigs[new_link]: - if i != contigs_map_rev[int(contig)]: - # Add edge to list of edges - 
edge_list.append((i, contigs_map_rev[int(contig)])) - - # Add edges to the graph - assembly_graph.add_edges(edge_list) - assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that the correct path to the assembly graph file is provided." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - logger.info("Total number of edges in the assembly graph: " + str(len(edge_list))) + assembly_graph, contigs_map, contig_names, node_count = parse_graph( + assembly_graph_file, contig_paths + ) # Get initial binning result # ---------------------------- - logger.info("Obtaining the initial binning result") - - bins = [[] for x in range(n_bins)] - - try: - with open(contig_bins_file) as contig_bins: - readCSV = csv.reader(contig_bins, delimiter=delimiter) - for row in readCSV: - start = "NODE_" - end = "_length_" - contig_num = contigs_map_rev[ - int(re.search("%s(.*)%s" % (start, end), row[0]).group(1)) - ] - - bin_num = bins_list.index(row[1]) - bins[bin_num].append(contig_num) - - except BaseException as err: - logger.error(f"Unexpected {err}") - logger.error( - "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format." - ) - logger.info("Exiting GraphBin... 
Bye...!") - sys.exit(1) - - # Remove labels of ambiguous vertices - # ------------------------------------- - - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - neighbours_have_same_label_list = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - # Get set of closest labelled vertices with distance = 1 - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - neighbours_binned = False - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - neighbours_binned = True - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - elif neighbours_binned: - neighbours_have_same_label_list.append(i) - - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - # Further remove labels of ambiguous vertices - binned_contigs = [] - - for n in range(n_bins): - binned_contigs = sorted(binned_contigs + bins[n]) - - for b in range(n_bins): - - for i in bins[b]: - - if i not in neighbours_have_same_label_list: - - my_bin = b - - # Get set of closest labelled vertices - closest_neighbours = getClosestLabelledVertices( - assembly_graph, i, binned_contigs - ) - - if len(closest_neighbours) > 0: - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label and 
i not in remove_labels: - if my_bin in remove_by_bin: - if ( - len(bins[my_bin]) - len(remove_by_bin[my_bin]) - >= MIN_BIN_COUNT - ): - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) - - logger.info("Obtaining the refined binning result") - - # Get vertices which are not isolated and not in components without any labels - # ----------------------------------------------------------------------------- - - logger.info( - "Deteremining vertices which are not isolated and not in components without any labels" + bins = get_initial_binning_result( + n_bins, bins_list, contig_bins_file, contigs_map.inverse, delimiter ) - non_isolated = [] - - for i in range(node_count): - - if i not in non_isolated and i in binned_contigs: - - component = [] - component.append(i) - length = len(component) - neighbours = assembly_graph.neighbors(i, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - component = list(set(component)) - - while length != len(component): - - length = len(component) - - for j in component: - - neighbours = assembly_graph.neighbors(j, mode=ALL) - - for neighbor in neighbours: - if neighbor not in component: - component.append(neighbor) - - labelled = False - for j in component: - if j in binned_contigs: - labelled = True - break - - if labelled: - for j in component: - if j not in non_isolated: - non_isolated.append(j) - - logger.info("Number of non-isolated contigs: " + str(len(non_isolated))) - - # Run label propagation - # ----------------------- - - data = [] - - for contig in range(node_count): - - # Consider vertices that are not isolated - - if contig in non_isolated: - line = [] - line.append(contig) 
- - assigned = False - - for i in range(n_bins): - if contig in bins[i]: - line.append(i + 1) - assigned = True - - if not assigned: - line.append(0) - - neighbours = assembly_graph.neighbors(contig, mode=ALL) - - neighs = [] - - for neighbour in neighbours: - n = [] - n.append(neighbour) - n.append(1.0) - neighs.append(n) - - line.append(neighs) - - data.append(line) - - # Check if initial binning result consists of contigs belonging to multiple bins - - multiple_bins = False - - for item in data: - if type(item[1]) is int and type(item[2]) is int: - multiple_bins = True - break - - if multiple_bins: - logger.error( - "Initial binning result consists of contigs belonging to multiple bins. Please make sure that each contig in the initial binning result belongs to only one bin." - ) - logger.info("Exiting GraphBin... Bye...!") - sys.exit(1) - - # Label propagation - - lp = LabelProp() - - lp.load_data_from_mem(data) - - logger.info( - "Starting label propagation with eps=" - + str(diff_threshold) - + " and max_iteration=" - + str(max_iteration) - ) - - ans = lp.run(diff_threshold, max_iteration, show_log=True, clean_result=False) - - logger.info("Obtaining Label Propagation result") - - for l in ans: - for i in range(n_bins): - if l[1] == i + 1 and l[0] not in bins[i]: - bins[i].append(l[0]) - - # Remove labels of ambiguous vertices + # Run GraphBin logic # ------------------------------------- - logger.info("Determining ambiguous vertices") - - remove_by_bin = {} - - remove_labels = [] - - for b in range(n_bins): - - for i in bins[b]: - - my_bin = b - - closest_neighbours = assembly_graph.neighbors(i, mode=ALL) - - # Determine whether all the closest labelled vertices have the same label as its own - neighbours_have_same_label = True - - for neighbour in closest_neighbours: - for k in range(n_bins): - if neighbour in bins[k]: - if k != my_bin: - neighbours_have_same_label = False - break - - if not neighbours_have_same_label: - if my_bin in remove_by_bin: - if 
len(bins[my_bin]) - len(remove_by_bin[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin].append(i) - else: - if len(bins[my_bin]) >= MIN_BIN_COUNT: - remove_labels.append(i) - remove_by_bin[my_bin] = [i] - - logger.info("Removing labels of ambiguous vertices") - - # Remove labels of ambiguous vertices - for i in remove_labels: - for n in range(n_bins): - if i in bins[n]: - bins[n].remove(i) + final_bins, remove_labels, non_isolated = graphbin_main( + n_bins, + bins, + bins_list, + assembly_graph, + node_count, + diff_threshold, + max_iteration, + ) elapsed_time = time.time() - start_time - logger.info("Obtaining the Final Refined Binning result") - - final_bins = {} - - for i in range(n_bins): - for contig in bins[i]: - final_bins[contig] = bins_list[i] - # Print elapsed time for the process - logger.info("Elapsed time: " + str(elapsed_time) + " seconds") + logger.info(f"Elapsed time: {elapsed_time} seconds") # Write result to output file # ----------------------------- - logger.info("Writing the Final Binning result to file") - - output_bins = [] - - output_bins_path = output_path + prefix + "bins/" - output_file = output_path + prefix + "graphbin_output.csv" - - if not os.path.isdir(output_bins_path): - subprocess.run("mkdir -p " + output_bins_path, shell=True) - - bin_files = {} - - for bin_name in set(final_bins.values()): - bin_files[bin_name] = open( - output_bins_path + prefix + "bin_" + bin_name + ".fasta", "w+" - ) - - for label, seq in MinimalFastaParser(contigs_file): - - contig_num = contig_names_rev[label] - - if contig_num in final_bins: - bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n") - - # Close output files - for c in set(final_bins.values()): - bin_files[c].close() - - for b in range(len(bins)): - - for contig in bins[b]: - line = [] - line.append(contig_names[contig]) - line.append(bins_list[b]) - output_bins.append(line) - - with open(output_file, mode="w") as out_file: - output_writer = csv.writer( - 
out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - for row in output_bins: - output_writer.writerow(row) - - logger.info("Final binning results can be found in " + str(output_bins_path)) - - unbinned_contigs = [] - - for i in range(node_count): - if i in remove_labels or i not in non_isolated: - line = [] - line.append(contig_names[i]) - unbinned_contigs.append(line) - - if len(unbinned_contigs) != 0: - unbinned_file = output_path + prefix + "graphbin_unbinned.csv" - - with open(unbinned_file, mode="w") as out_file: - output_writer = csv.writer( - out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL - ) - - for row in unbinned_contigs: - output_writer.writerow(row) - - logger.info("Unbinned contigs can be found at " + unbinned_file) - - # Exit program - # -------------- - - logger.info("Thank you for using GraphBin! Bye...!") - - logger.removeHandler(fileHandler) - logger.removeHandler(consoleHeader) + write_output( + output_path, + prefix, + final_bins, + contigs_file, + contig_names.inverse, + bins, + contig_names, + bins_list, + delimiter, + node_count, + remove_labels, + non_isolated, + ) -def main(): - # Setup argument parser - # --------------------------------------------------- - ap = PARSER - ap.add_argument("--paths", type=str, help="path to the contigs.paths file") - args = ap.parse_args() +def main(args): run(args) diff --git a/src/graphbin/utils/labelpropagation/labelprop.py b/src/graphbin/utils/labelpropagation/labelprop.py index 4c825a7..f96bc0b 100644 --- a/src/graphbin/utils/labelpropagation/labelprop.py +++ b/src/graphbin/utils/labelpropagation/labelprop.py @@ -6,11 +6,12 @@ import logging + __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Lingzhe Teng", "Vijini Mallawaarachchi"] __license__ = "BSD-3" -__version__ = "1.6.0" +__version__ = "1.7.1" __maintainer__ = "Vijini Mallawaarachchi" __email__ = "vijini.mallawaarachchi@anu.edu.au" 
__status__ = "Production" @@ -48,7 +49,6 @@ def initialize_env(self): self.labelled_size = 0 def setup_env(self): - # initialize vertex_in_adj_map for vertex_id in self.vertex_adj_map.keys(): if vertex_id not in self.vertex_in_adj_map: @@ -101,14 +101,6 @@ def setup_env(self): arr.append(0.0) self.vertex_f_map.setdefault(v, arr) - def load_data_from_file(self, filename): - import ast - - with open(filename, "rb") as f: - lines = [ast.literal_eval(_.strip()) for _ in f.readlines()] - self.vertex_size = len(lines) - self.load_data_from_mem(lines) - def load_data_from_mem(self, data): self.initialize_env() self.vertex_size = len(data) @@ -121,21 +113,16 @@ def process_data_line(self, line): # unlabeled vertex if vertexLabel == 0 # i.e. [2, 1, [[1, 1.0], [3, 1.0]]] - try: - vertex_id = line[0] - vertex_label = line[1] - edges = line[2] - edge_list = [] - self.vertex_label_map.setdefault(vertex_id, vertex_label) - for edge in edges: - dest_vertex_id = int(edge[0]) - edge_weight = float(edge[1]) - edge_list.append(Edge(vertex_id, dest_vertex_id, edge_weight)) - self.vertex_adj_map.setdefault(vertex_id, edge_list) - - except Exception as e: - - raise Exception("Coundn't parse vertex from line") + vertex_id = line[0] + vertex_label = line[1] + edges = line[2] + edge_list = [] + self.vertex_label_map.setdefault(vertex_id, vertex_label) + for edge in edges: + dest_vertex_id = int(edge[0]) + edge_weight = float(edge[1]) + edge_list.append(Edge(vertex_id, dest_vertex_id, edge_weight)) + self.vertex_adj_map.setdefault(vertex_id, edge_list) ################################################################################ # Label Propagation diff --git a/src/graphbin/utils/parsers/__init__.py b/src/graphbin/utils/parsers/__init__.py new file mode 100644 index 0000000..011b684 --- /dev/null +++ b/src/graphbin/utils/parsers/__init__.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 + +import csv +import logging +import re +import sys + + +__author__ = "Vijini Mallawaarachchi" 
+__copyright__ = "Copyright 2019-2022, GraphBin Project" +__credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] +__license__ = "BSD-3" +__version__ = "1.7.1" +__maintainer__ = "Vijini Mallawaarachchi" +__email__ = "vijini.mallawaarachchi@anu.edu.au" +__status__ = "Production" + + +logger = logging.getLogger("GraphBin %s" % __version__) + + +def get_initial_bin_count(contig_bins_file, delimiter): + n_bins = 0 + + try: + all_bins_list = [] + + with open(contig_bins_file) as csvfile: + readCSV = csv.reader(csvfile, delimiter=delimiter) + for row in readCSV: + all_bins_list.append(row[1]) + + bins_list = list(set(all_bins_list)) + bins_list.sort() + + n_bins = len(bins_list) + logger.info( + "Number of bins available in the initial binning result: " + str(n_bins) + ) + + except BaseException as err: + logger.error(f"Unexpected {err}") + logger.error( + "Please make sure that the correct path to the initial binning result file is provided and it is having the correct format." + ) + logger.info("Exiting GraphBin... 
Bye...!") + sys.exit(1) + + return n_bins, bins_list diff --git a/src/graphbin/utils/parsers/canu_parser.py b/src/graphbin/utils/parsers/canu_parser.py new file mode 100644 index 0000000..9bb4559 --- /dev/null +++ b/src/graphbin/utils/parsers/canu_parser.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 + +import csv +import logging +import os +import subprocess +import sys + +from cogent3.parse.fasta import MinimalFastaParser +from igraph import * + +from graphbin.utils.bidirectionalmap.bidirectionalmap import BidirectionalMap + + +__author__ = "Vijini Mallawaarachchi" +__copyright__ = "Copyright 2019-2022, GraphBin Project" +__credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] +__license__ = "BSD-3" +__version__ = "1.7.1" +__maintainer__ = "Vijini Mallawaarachchi" +__email__ = "vijini.mallawaarachchi@anu.edu.au" +__status__ = "Production" + + +logger = logging.getLogger("GraphBin %s" % __version__) + + +def get_initial_binning_result( + n_bins, bins_list, contig_bins_file, contigs_map_rev, delimiter +): + logger.info("Obtaining the initial binning result") + + bins = [[] for x in range(n_bins)] + + try: + with open(contig_bins_file) as contig_bins: + readCSV = csv.reader(contig_bins, delimiter=delimiter) + for row in readCSV: + contig_num = contigs_map_rev[row[0]] + + bin_num = bins_list.index(row[1]) + bins[bin_num].append(contig_num) + + except BaseException as err: + logger.error(f"Unexpected {err}") + logger.error( + "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format." + ) + logger.info("Exiting GraphBin... 
Bye...!") + sys.exit(1) + + return bins + + +def parse_graph(assembly_graph_file): + # Get the links from the .gfa file + # ----------------------------------- + + my_map = BidirectionalMap() + + node_count = 0 + + nodes = [] + + links = [] + + try: + # Get contig connections from .gfa file + with open(assembly_graph_file) as file: + for line in file.readlines(): + line = line.strip() + + # Count the number of contigs + if line.startswith("S"): + strings = line.split("\t") + my_node = strings[1] + my_map[node_count] = my_node + nodes.append(my_node) + node_count += 1 + + # Identify lines with link information + elif line.startswith("L"): + link = [] + strings = line.split("\t") + + if strings[1] != strings[3]: + start = strings[1] + end = strings[3] + link.append(start) + link.append(end) + links.append(link) + + except BaseException as err: + logger.error(f"Unexpected {err}") + logger.error( + "Please make sure that the correct path to the assembly graph file is provided." + ) + logger.info("Exiting GraphBin... 
Bye...!") + sys.exit(1) + + contigs_map = my_map + contigs_map_rev = my_map.inverse + + logger.info(f"Total number of contigs available: {node_count}") + + ## Construct the assembly graph + # ------------------------------- + + try: + # Create the graph + assembly_graph = Graph() + + # Create list of edges + edge_list = [] + + # Add vertices + assembly_graph.add_vertices(node_count) + + # Name vertices + for i in range(len(assembly_graph.vs)): + assembly_graph.vs[i]["id"] = i + assembly_graph.vs[i]["label"] = str(contigs_map[i]) + + # Iterate links + for link in links: + # Remove self loops + if link[0] != link[1]: + # Add edge to list of edges + edge_list.append((contigs_map_rev[link[0]], contigs_map_rev[link[1]])) + + # Add edges to the graph + assembly_graph.add_edges(edge_list) + assembly_graph.simplify(multiple=True, loops=False, combine_edges=None) + + except BaseException as err: + logger.error(f"Unexpected {err}") + logger.error( + "Please make sure that the correct path to the assembly graph file is provided." + ) + logger.info("Exiting GraphBin... 
Bye...!") + sys.exit(1) + + logger.info(f"Total number of edges in the assembly graph: {len(edge_list)}") + + return assembly_graph, contigs_map, node_count + + +def write_output( + output_path, + prefix, + final_bins, + contigs_file, + contigs_map_rev, + bins, + contigs_map, + bins_list, + delimiter, + node_count, + remove_labels, + non_isolated, +): + logger.info("Writing the Final Binning result to file") + + output_bins = [] + + output_bins_path = f"{output_path}{prefix}bins/" + output_file = f"{output_path}{prefix}graphbin_output.csv" + + if not os.path.isdir(output_bins_path): + subprocess.run(f"mkdir -p {output_bins_path}", shell=True) + + bin_files = {} + + for bin_name in set(final_bins.values()): + bin_files[bin_name] = open( + f"{output_bins_path}{prefix}bin_{bin_name}.fasta", "w+" + ) + + for label, seq in MinimalFastaParser( + contigs_file, label_to_name=lambda x: x.split()[0] + ): + contig_num = contigs_map_rev[label] + + if contig_num in final_bins: + bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n") + + # Close output files + for c in set(final_bins.values()): + bin_files[c].close() + + for b in range(len(bins)): + for contig in bins[b]: + line = [] + line.append(str(contigs_map[contig])) + line.append(bins_list[b]) + output_bins.append(line) + + with open(output_file, mode="w") as out_file: + output_writer = csv.writer( + out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL + ) + for row in output_bins: + output_writer.writerow(row) + + logger.info(f"Final binning results can be found in {output_bins_path}") + + unbinned_contigs = [] + + for i in range(node_count): + if i in remove_labels or i not in non_isolated: + line = [] + line.append(str(contigs_map[i])) + unbinned_contigs.append(line) + + if len(unbinned_contigs) != 0: + unbinned_file = f"{output_path}{prefix}graphbin_unbinned.csv" + + with open(unbinned_file, mode="w") as out_file: + output_writer = csv.writer( + out_file, delimiter=delimiter, quotechar='"', 
quoting=csv.QUOTE_MINIMAL + ) + + for row in unbinned_contigs: + output_writer.writerow(row) + + logger.info(f"Unbinned contigs can be found at {unbinned_file}") diff --git a/src/graphbin/utils/parsers/flye_parser.py b/src/graphbin/utils/parsers/flye_parser.py new file mode 100644 index 0000000..0c30216 --- /dev/null +++ b/src/graphbin/utils/parsers/flye_parser.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 + +import csv +import logging +import os +import subprocess +import sys + +from collections import defaultdict + +from cogent3.parse.fasta import MinimalFastaParser +from igraph import * + +from graphbin.utils.bidirectionalmap.bidirectionalmap import BidirectionalMap + + +__author__ = "Vijini Mallawaarachchi" +__copyright__ = "Copyright 2019-2022, GraphBin Project" +__credits__ = ["Vijini Mallawaarachchi", "Anuradha Wickramarachchi", "Yu Lin"] +__license__ = "BSD-3" +__version__ = "1.7.1" +__maintainer__ = "Vijini Mallawaarachchi" +__email__ = "vijini.mallawaarachchi@anu.edu.au" +__status__ = "Production" + + +logger = logging.getLogger("GraphBin %s" % __version__) + + +def get_initial_binning_result( + n_bins, bins_list, contig_bins_file, contig_names_rev, delimiter +): + logger.info("Obtaining the initial binning result") + + bins = [[] for x in range(n_bins)] + + try: + with open(contig_bins_file) as contig_bins: + readCSV = csv.reader(contig_bins, delimiter=delimiter) + for row in readCSV: + contig_num = contig_names_rev[row[0]] + + bin_num = bins_list.index(row[1]) + bins[bin_num].append(contig_num) + + except BaseException as err: + logger.error(f"Unexpected {err}") + logger.error( + "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format." + ) + logger.info("Exiting GraphBin... 
def parse_graph(assembly_graph_file, contig_paths):
    """Build the assembly graph for a Flye-style assembly.

    Reads contig names and their segment paths from ``contig_paths`` and the
    segment links from the GFA ``assembly_graph_file``, then connects contigs
    that share a segment or are joined through a linked segment.

    Returns a tuple ``(assembly_graph, contig_names, node_count)``.
    """
    # First pass over the paths file: assign a sequential internal id to
    # every contig name (lines starting with "#" are header/comment lines).
    contig_names = BidirectionalMap()

    idx = 0
    with open(contig_paths, "r") as paths_file:
        for record in paths_file.readlines():
            if not record.startswith("#"):
                contig_names[idx] = record.strip().split()[0]
                idx += 1

    contig_names_rev = contig_names.inverse

    # Second pass: collect each contig's segment path and index which
    # contigs contain which segment.
    paths = {}
    segment_contigs = {}

    try:
        with open(contig_paths) as paths_file:
            for record in paths_file.readlines():
                if record.startswith("#"):
                    continue

                fields = record.strip().split()
                contig_id = contig_names_rev[fields[0]]

                raw_path = fields[-1].replace("*", "")
                # Trim at most one leading and one trailing comma.
                if raw_path.startswith(","):
                    raw_path = raw_path[1:]
                if raw_path.endswith(","):
                    raw_path = raw_path[:-1]

                segments = raw_path.rstrip().split(",")

                if contig_id not in paths:
                    paths[contig_id] = segments

                for seg in segments:
                    if seg != "":
                        segment_contigs.setdefault(seg, set()).add(contig_id)

        links_map = defaultdict(set)

        # Collect segment links from the GFA file ("L" records).
        with open(assembly_graph_file) as gfa_file:
            for record in gfa_file.readlines():
                record = record.strip()

                if not record.startswith("L"):
                    continue

                fields = record.split("\t")

                # Orientation becomes a leading "-" on the segment id; the
                # [5:] slice drops a fixed-width segment-name prefix —
                # presumably "edge_" for Flye GFAs, TODO confirm.
                left, right = "", ""
                if fields[2] == "+":
                    left = fields[1][5:]
                elif fields[2] == "-":
                    left = "-" + fields[1][5:]
                if fields[4] == "+":
                    right = fields[3][5:]
                elif fields[4] == "-":
                    right = "-" + fields[3][5:]

                links_map[left].add(right)
                links_map[right].add(left)

        # Build the edge list: contigs sharing a segment (in either
        # orientation) or reachable through a linked segment get an edge.
        edge_list = []

        def _connect(contig_id, seg):
            # Append an edge from contig_id to every other contig that
            # contains segment `seg` (duplicates are removed by simplify()).
            for other in segment_contigs.get(seg, ()):
                if contig_id != other:
                    edge_list.append((contig_id, other))

        for contig_id in paths:
            new_links = []

            for seg in paths[contig_id]:
                if seg.startswith("-"):
                    seg_rev = seg[1:]
                    seg_num = seg[1:]
                else:
                    seg_rev = "-" + seg
                    seg_num = seg

                if seg in links_map:
                    new_links.extend(list(links_map[seg]))
                if seg_rev in links_map:
                    new_links.extend(list(links_map[seg_rev]))

                _connect(contig_id, seg)
                _connect(contig_id, seg_rev)
                _connect(contig_id, seg_num)

            for linked in new_links:
                _connect(contig_id, linked)
                if linked.startswith("-"):
                    _connect(contig_id, linked[1:])

        node_count = len(contig_names_rev)

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that the correct path to the assembly graph file is provided."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    logger.info(f"Total number of contigs available: {node_count}")

    ## Construct the assembly graph
    # -------------------------------

    try:
        assembly_graph = Graph()
        assembly_graph.add_vertices(node_count)

        for vertex in range(len(assembly_graph.vs)):
            assembly_graph.vs[vertex]["id"] = vertex
            assembly_graph.vs[vertex]["label"] = str(contig_names[vertex])

        assembly_graph.add_edges(edge_list)
        assembly_graph.simplify(multiple=True, loops=False, combine_edges=None)

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that the correct path to the assembly graph file is provided."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    logger.info(f"Total number of edges in the assembly graph: {len(edge_list)}")

    return assembly_graph, contig_names, node_count
def write_output(
    output_path,
    prefix,
    final_bins,
    contigs_file,
    contig_names_rev,
    bins,
    contig_names,
    bins_list,
    delimiter,
    node_count,
    remove_labels,
    non_isolated,
):
    """Write the final binning result of a Flye assembly to disk.

    Produces one FASTA file per bin under ``<output_path><prefix>bins/``,
    a ``graphbin_output.csv`` mapping contig names to bin labels, and
    (when any contigs remain unbinned) a ``graphbin_unbinned.csv``.
    """
    logger.info("Writing the Final Binning result to file")

    output_bins = []

    output_bins_path = f"{output_path}{prefix}bins/"
    output_file = f"{output_path}{prefix}graphbin_output.csv"

    # Fixed: create the directory with os.makedirs instead of shelling out
    # to `mkdir -p` via subprocess with shell=True, which was non-portable
    # and injectable through output_path/prefix.
    os.makedirs(output_bins_path, exist_ok=True)

    bin_files = {}

    for bin_name in set(final_bins.values()):
        bin_files[bin_name] = open(
            f"{output_bins_path}{prefix}bin_{bin_name}.fasta", "w+"
        )

    # Route every binned contig's sequence to its bin's FASTA file.
    for label, seq in MinimalFastaParser(contigs_file):
        contig_num = contig_names_rev[label]

        if contig_num in final_bins:
            bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n")

    # Close output files
    for c in set(final_bins.values()):
        bin_files[c].close()

    for b in range(len(bins)):
        for contig in bins[b]:
            output_bins.append([str(contig_names[contig]), bins_list[b]])

    with open(output_file, mode="w") as out_file:
        output_writer = csv.writer(
            out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        for row in output_bins:
            output_writer.writerow(row)

    logger.info(f"Final binning results can be found in {output_bins_path}")

    # Contigs removed from the labelling or not in the non-isolated set
    # stay unbinned.
    unbinned_contigs = []

    for i in range(node_count):
        if i in remove_labels or i not in non_isolated:
            unbinned_contigs.append([str(contig_names[i])])

    if len(unbinned_contigs) != 0:
        unbinned_file = f"{output_path}{prefix}graphbin_unbinned.csv"

        with open(unbinned_file, mode="w") as out_file:
            output_writer = csv.writer(
                out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL
            )

            for row in unbinned_contigs:
                output_writer.writerow(row)

        logger.info(f"Unbinned contigs can be found at {unbinned_file}")
def get_initial_binning_result(
    n_bins,
    bins_list,
    contig_bins_file,
    contigs_map_rev,
    graph_to_contig_map_rev,
    delimiter,
):
    """Load the initial binning result for a MEGAHIT assembly.

    Each CSV row is ``(contig_name, bin_label)``; the contig name is
    translated through ``graph_to_contig_map_rev`` and ``contigs_map_rev``
    into the internal sequential contig id. Returns a list of ``n_bins``
    lists of internal contig ids. Exits the program on any parse error.
    """
    logger.info("Obtaining the initial binning result")

    bins = [[] for _ in range(n_bins)]

    try:
        with open(contig_bins_file) as contig_bins:
            for row in csv.reader(contig_bins, delimiter=delimiter):
                # Map assembler contig name -> graph contig id -> internal id.
                internal_id = contigs_map_rev[int(graph_to_contig_map_rev[row[0]])]
                bins[bins_list.index(row[1])].append(internal_id)

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    return bins
def parse_graph(assembly_graph_file, original_contigs):
    """Parse a MEGAHIT GFA assembly graph.

    Collects segment ("S") records as contigs and link ("L") records as
    edges, builds the igraph assembly graph, and matches graph contigs to
    the original contigs by identical sequence.

    Returns ``(assembly_graph, graph_to_contig_map, contigs_map, node_count)``.
    """
    node_count = 0
    graph_contigs = {}
    links = []
    my_map = BidirectionalMap()

    # Extracts the numeric id out of names shaped like "NODE_<num>_length...".
    node_id_pattern = re.compile("NODE_(.*)_length")

    try:
        with open(assembly_graph_file) as gfa_file:
            for record in gfa_file.readlines():
                record = record.strip()

                if record.startswith("L"):
                    # Link record: both endpoints are contig ids.
                    fields = record.split("\t")
                    source = int(node_id_pattern.search(fields[1]).group(1))
                    target = int(node_id_pattern.search(fields[3]).group(1))
                    links.append([source, target])

                elif record.startswith("S"):
                    # Segment record: register the contig and keep its sequence.
                    fields = record.split()
                    contig_num = int(node_id_pattern.search(fields[1]).group(1))
                    my_map[node_count] = int(contig_num)
                    graph_contigs[contig_num] = fields[2]
                    node_count += 1

        logger.info(f"Total number of contigs available: {node_count}")

        contigs_map = my_map
        contigs_map_rev = my_map.inverse

        # Create graph
        assembly_graph = Graph()
        assembly_graph.add_vertices(node_count)

        edge_list = []

        for i in range(node_count):
            assembly_graph.vs[i]["id"] = i
            assembly_graph.vs[i]["label"] = str(contigs_map[i])

        for source, target in links:
            # Remove self loops
            if source != target:
                edge_list.append((contigs_map_rev[source], contigs_map_rev[target]))

        assembly_graph.add_edges(edge_list)
        assembly_graph.simplify(multiple=True, loops=False, combine_edges=None)

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that the correct path to the assembly graph file is provided."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    logger.info(f"Total number of edges in the assembly graph: {len(edge_list)}")

    # Map original contig ids to the ids used in the assembly graph by
    # pairing entries in file order and keeping sequence-identical pairs.
    graph_to_contig_map = BidirectionalMap()

    for (n, m), (n2, m2) in zip(graph_contigs.items(), original_contigs.items()):
        if m == m2:
            graph_to_contig_map[n] = n2

    return assembly_graph, graph_to_contig_map, contigs_map, node_count
def get_contig_descriptors(contigs_file):
    """Return a mapping from contig name to its sequence.

    The contig name is the first whitespace-delimited token of each FASTA
    label in ``contigs_file``.

    Fixed: the original also built a parallel ``contig_descriptions`` dict
    (name -> full label) that was never returned or used anywhere in this
    module — that dead work has been removed.
    """
    original_contigs = {}

    for label, seq in MinimalFastaParser(contigs_file):
        original_contigs[label.split()[0]] = seq

    return original_contigs
def get_initial_binning_result(
    n_bins, bins_list, contig_bins_file, contigs_map_rev, delimiter
):
    """Load the initial binning result for a miniasm assembly.

    Each CSV row is ``(contig_name, bin_label)``; the name is looked up
    directly in ``contigs_map_rev`` to obtain the internal contig id.
    Returns a list of ``n_bins`` lists of internal contig ids. Exits the
    program on any parse error.
    """
    logger.info("Obtaining the initial binning result")

    bins = [[] for _ in range(n_bins)]

    try:
        with open(contig_bins_file) as contig_bins:
            reader = csv.reader(contig_bins, delimiter=delimiter)
            for row in reader:
                internal_id = contigs_map_rev[row[0]]
                bin_index = bins_list.index(row[1])
                bins[bin_index].append(internal_id)

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    return bins
def parse_graph(assembly_graph_file):
    """Parse a miniasm GFA assembly graph.

    Segment ("S") records become vertices named by their segment id; link
    ("L") records between two distinct segments become edges.

    Returns ``(assembly_graph, contigs_map, node_count)``.
    """
    my_map = BidirectionalMap()
    node_count = 0
    nodes = []
    links = []

    try:
        # Get contig connections from .gfa file
        with open(assembly_graph_file) as gfa_file:
            for record in gfa_file.readlines():
                record = record.strip()

                if record.startswith("S"):
                    # Segment record: register the contig under a sequential id.
                    fields = record.split("\t")
                    segment_name = fields[1]
                    my_map[node_count] = segment_name
                    nodes.append(segment_name)
                    node_count += 1

                elif record.startswith("L"):
                    # Link record: keep only links between distinct segments.
                    fields = record.split("\t")
                    if fields[1] != fields[3]:
                        links.append([fields[1], fields[3]])

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that the correct path to the assembly graph file is provided."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    contigs_map = my_map
    contigs_map_rev = my_map.inverse

    logger.info(f"Total number of contigs available: {node_count}")

    ## Construct the assembly graph
    # -------------------------------

    try:
        assembly_graph = Graph()
        edge_list = []
        assembly_graph.add_vertices(node_count)

        for vertex in range(len(assembly_graph.vs)):
            assembly_graph.vs[vertex]["id"] = vertex
            assembly_graph.vs[vertex]["label"] = str(contigs_map[vertex])

        for source, target in links:
            # Remove self loops
            if source != target:
                edge_list.append((contigs_map_rev[source], contigs_map_rev[target]))

        assembly_graph.add_edges(edge_list)
        assembly_graph.simplify(multiple=True, loops=False, combine_edges=None)

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that the correct path to the assembly graph file is provided."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    logger.info(f"Total number of edges in the assembly graph: {len(edge_list)}")

    return assembly_graph, contigs_map, node_count
def write_output(
    output_path,
    prefix,
    final_bins,
    contigs_file,
    contigs_map,
    bins,
    bins_list,
    delimiter,
    node_count,
    remove_labels,
    non_isolated,
):
    """Write the final binning result of a miniasm assembly to disk.

    Produces one FASTA file per bin under ``<output_path><prefix>bins/``,
    a ``graphbin_output.csv`` mapping contig names to bin labels, and
    (when any contigs remain unbinned) a ``graphbin_unbinned.csv``.
    """
    logger.info("Writing the Final Binning result to file")

    output_bins = []
    contigs_map_rev = contigs_map.inverse

    output_bins_path = f"{output_path}{prefix}bins/"
    output_file = f"{output_path}{prefix}graphbin_output.csv"

    # Fixed: create the directory with os.makedirs instead of shelling out
    # to `mkdir -p` via subprocess with shell=True, which was non-portable
    # and injectable through output_path/prefix.
    os.makedirs(output_bins_path, exist_ok=True)

    bin_files = {}

    for bin_name in set(final_bins.values()):
        bin_files[bin_name] = open(
            f"{output_bins_path}{prefix}bin_{bin_name}.fasta", "w+"
        )

    # Route every binned contig's sequence to its bin's FASTA file.
    for label, seq in MinimalFastaParser(contigs_file):
        contig_num = contigs_map_rev[label]

        if contig_num in final_bins:
            bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n")

    # Close output files
    for c in set(final_bins.values()):
        bin_files[c].close()

    for b in range(len(bins)):
        for contig in bins[b]:
            output_bins.append([str(contigs_map[contig]), bins_list[b]])

    with open(output_file, mode="w") as out_file:
        output_writer = csv.writer(
            out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        for row in output_bins:
            output_writer.writerow(row)

    logger.info(f"Final binning results can be found in {output_bins_path}")

    # Contigs removed from the labelling or not in the non-isolated set
    # stay unbinned.
    unbinned_contigs = []

    for i in range(node_count):
        if i in remove_labels or i not in non_isolated:
            unbinned_contigs.append([str(contigs_map[i])])

    if len(unbinned_contigs) != 0:
        unbinned_file = f"{output_path}{prefix}graphbin_unbinned.csv"

        with open(unbinned_file, mode="w") as out_file:
            output_writer = csv.writer(
                out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL
            )

            for row in unbinned_contigs:
                output_writer.writerow(row)

        logger.info(f"Unbinned contigs can be found at {unbinned_file}")
def get_initial_binning_result(
    n_bins, bins_list, contig_bins_file, contigs_map_rev, delimiter
):
    """Load the initial binning result for an SGA assembly.

    Each CSV row is ``(contig_name, bin_label)`` where the name is shaped
    like ``contig-<num>``; the numeric part is extracted and looked up in
    ``contigs_map_rev``. Returns a list of ``n_bins`` lists of internal
    contig ids. Exits the program on any parse error.
    """
    logger.info("Obtaining the initial binning result")

    bins = [[] for _ in range(n_bins)]

    # Pull the numeric suffix out of "contig-<num>" names.
    contig_id_pattern = re.compile("contig-(.*)")

    try:
        with open(contig_bins_file) as contig_bins:
            for row in csv.reader(contig_bins, delimiter=delimiter):
                contig_num = int(contig_id_pattern.search(row[0]).group(1))
                internal_id = contigs_map_rev[contig_num]
                bins[bins_list.index(row[1])].append(internal_id)

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    return bins
def parse_graph(assembly_graph_file):
    """Parse an SGA ``.asqg`` assembly graph.

    Vertex ("VT") records register contigs named ``contig-<num>``; edge
    ("ED") records connect two overlapping contigs.

    Returns ``(assembly_graph, contigs_map, contig_names, node_count)``.
    """
    links = []
    contig_names = BidirectionalMap()
    my_map = BidirectionalMap()
    node_count = 0

    # Pull the numeric suffix out of "contig-<num>" names.
    contig_id_pattern = re.compile("contig-(.*)")

    try:
        # Get contig connections from .asqg file
        with open(assembly_graph_file) as asqg_file:
            for record in asqg_file.readlines():
                record = record.strip()

                if record.startswith("VT"):
                    # Vertex record: register the contig under a sequential id.
                    contig_name = str(record.split()[1])
                    contig_num = int(contig_id_pattern.search(contig_name).group(1))
                    my_map[node_count] = contig_num
                    contig_names[node_count] = contig_name.strip()
                    node_count += 1

                elif record.startswith("ED"):
                    # Edge record: the first two fields after "ED" are the
                    # two contig names; [7:] drops the "contig-" prefix.
                    fields = record.split("\t")[1].split()
                    links.append([int(fields[0][7:]), int(fields[1][7:])])

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that the correct path to the assembly graph file is provided."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    contigs_map = my_map
    contigs_map_rev = my_map.inverse

    # Computed for parity with the other parsers; not used below.
    contig_names_rev = contig_names.inverse

    logger.info(f"Total number of contigs available: {node_count}")

    try:
        assembly_graph = Graph()
        edge_list = []
        assembly_graph.add_vertices(node_count)

        for vertex in range(len(assembly_graph.vs)):
            assembly_graph.vs[vertex]["id"] = vertex
            assembly_graph.vs[vertex]["label"] = str(contigs_map[vertex])

        for source, target in links:
            # Remove self loops
            if source != target:
                edge_list.append((contigs_map_rev[source], contigs_map_rev[target]))

        assembly_graph.add_edges(edge_list)
        assembly_graph.simplify(multiple=True, loops=False, combine_edges=None)

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that the correct path to the assembly graph file is provided."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    logger.info(f"Total number of edges in the assembly graph: {len(edge_list)}")

    return assembly_graph, contigs_map, contig_names, node_count
def write_output(
    output_path,
    prefix,
    final_bins,
    contigs_file,
    contig_names_rev,
    bins,
    contig_names,
    bins_list,
    delimiter,
    node_count,
    remove_labels,
    non_isolated,
    contig_descriptions,
):
    """Write the final binning result of an SGA assembly to disk.

    Produces one FASTA file per bin under ``<output_path><prefix>bins/``,
    a ``graphbin_output.csv`` mapping full contig descriptions to bin
    labels, and (when any contigs remain unbinned) a
    ``graphbin_unbinned.csv``.
    """
    logger.info("Writing the Final Binning result to file")

    output_bins = []

    output_bins_path = f"{output_path}{prefix}bins/"
    output_file = f"{output_path}{prefix}graphbin_output.csv"

    # Fixed: create the directory with os.makedirs instead of shelling out
    # to `mkdir -p` via subprocess with shell=True, which was non-portable
    # and injectable through output_path/prefix.
    os.makedirs(output_bins_path, exist_ok=True)

    bin_files = {}

    for bin_name in set(final_bins.values()):
        bin_files[bin_name] = open(
            f"{output_bins_path}{prefix}bin_{bin_name}.fasta", "w+"
        )

    # FASTA labels are truncated to their first token to match the names
    # stored in contig_names_rev.
    for label, seq in MinimalFastaParser(
        contigs_file, label_to_name=lambda x: x.split()[0]
    ):
        contig_num = contig_names_rev[label]

        if contig_num in final_bins:
            bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n")

    # Close output files
    for c in set(final_bins.values()):
        bin_files[c].close()

    for b in range(len(bins)):
        for contig in bins[b]:
            output_bins.append(
                [contig_descriptions[contig_names[contig]], bins_list[b]]
            )

    with open(output_file, mode="w") as out_file:
        output_writer = csv.writer(
            out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        for row in output_bins:
            output_writer.writerow(row)

    logger.info(f"Final binning results can be found in {output_bins_path}")

    # Contigs removed from the labelling or not in the non-isolated set
    # stay unbinned.
    unbinned_contigs = []

    for i in range(node_count):
        if i in remove_labels or i not in non_isolated:
            unbinned_contigs.append([contig_names[i]])

    if len(unbinned_contigs) != 0:
        unbinned_file = f"{output_path}{prefix}graphbin_unbinned.csv"

        with open(unbinned_file, mode="w") as out_file:
            output_writer = csv.writer(
                out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL
            )

            for row in unbinned_contigs:
                output_writer.writerow(row)

        logger.info(f"Unbinned contigs can be found at {unbinned_file}")
def get_contig_descriptions(contigs_file):
    """Map each contig name (first token of the FASTA label) to its full label."""
    return {
        label.split()[0]: label
        for label, _ in MinimalFastaParser(contigs_file)
    }
def get_initial_binning_result(
    n_bins, bins_list, contig_bins_file, contigs_map_rev, delimiter
):
    """Load the initial binning result for a metaSPAdes assembly.

    Each CSV row is ``(contig_name, bin_label)`` where the name is shaped
    like ``NODE_<num>_length_...``; the numeric part is extracted and
    looked up in ``contigs_map_rev``. Returns a list of ``n_bins`` lists
    of internal contig ids. Exits the program on any parse error.
    """
    logger.info("Obtaining the initial binning result")

    bins = [[] for _ in range(n_bins)]

    # Pull the numeric id out of "NODE_<num>_length_..." names.
    node_id_pattern = re.compile("NODE_(.*)_length_")

    try:
        with open(contig_bins_file) as contig_bins:
            for row in csv.reader(contig_bins, delimiter=delimiter):
                contig_num = int(node_id_pattern.search(row[0]).group(1))
                internal_id = contigs_map_rev[contig_num]
                bins[bins_list.index(row[1])].append(internal_id)

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that you have provided the correct assembler type and the correct path to the binning result file in the correct format."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    return bins
def parse_graph(assembly_graph_file, contig_paths):
    """Parse a metaSPAdes assembly graph and its contig paths file.

    Reads each contig's segment path from ``contig_paths`` (keeping only the
    first and last segment of every path), collects oriented segment links
    from the GFA file, and connects contigs whose path ends are linked.

    Returns ``(assembly_graph, contigs_map, contig_names, node_count)``.
    """
    paths = {}
    segment_contigs = {}
    node_count = 0

    contig_names = BidirectionalMap()
    my_map = BidirectionalMap()

    current_contig_num = ""

    # Pull the numeric id out of "NODE_<num>_length_..." names.
    node_id_pattern = re.compile("NODE_(.*)_length_")

    try:
        with open(contig_paths) as paths_file:
            header = paths_file.readline()
            path_line = paths_file.readline()

            while header != "" and path_line != "":
                # A ";" means the path continues on the next physical line.
                while ";" in path_line:
                    path_line = path_line[:-2] + "," + paths_file.readline()

                contig_num = str(int(node_id_pattern.search(header).group(1)))

                segments = path_line.rstrip().split(",")

                # Forward and reverse-complement paths share the contig
                # number; register the contig only on first sight.
                if current_contig_num != contig_num:
                    my_map[node_count] = int(contig_num)
                    current_contig_num = contig_num
                    contig_names[node_count] = header.strip()
                    node_count += 1

                if contig_num not in paths:
                    # Only the first and last segment of a path are needed.
                    paths[contig_num] = [segments[0], segments[-1]]

                for segment in segments:
                    segment_contigs.setdefault(segment, set()).add(contig_num)

                header = paths_file.readline()
                path_line = paths_file.readline()

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that the correct path to the contig paths file is provided."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    contigs_map = my_map
    contigs_map_rev = my_map.inverse

    logger.info(f"Total number of contigs available: {node_count}")

    links = []
    links_map = defaultdict(set)

    ## Construct the assembly graph
    # -------------------------------

    try:
        # Get links from assembly_graph_with_scaffolds.gfa
        with open(assembly_graph_file) as gfa_file:
            for record in gfa_file.readlines():
                record = record.strip()

                if record.startswith("L"):
                    fields = record.split("\t")
                    # Oriented segment ids, e.g. "123+" / "456-".
                    f1 = fields[1] + fields[2]
                    f2 = fields[3] + fields[4]
                    links_map[f1].add(f2)
                    links_map[f2].add(f1)
                    links.append(f1 + " " + f2)

        assembly_graph = Graph()
        assembly_graph.add_vertices(node_count)

        edge_list = []

        for i in range(node_count):
            assembly_graph.vs[i]["id"] = i
            assembly_graph.vs[i]["label"] = str(contigs_map[i])

        def _flip(segment):
            # "123+" <-> "123-"
            return segment[:-1] + ("-" if segment.endswith("+") else "+")

        for i in range(len(paths)):
            start, end = paths[str(contigs_map[i])]

            # Segments linked to either end of this contig's path, in both
            # orientations.
            candidate_links = []
            for key in (start, _flip(start), end, _flip(end)):
                if key in links_map:
                    candidate_links.extend(list(links_map[key]))

            for linked in candidate_links:
                if linked in segment_contigs:
                    for contig in segment_contigs[linked]:
                        if i != contigs_map_rev[int(contig)]:
                            # Add edge to list of edges
                            edge_list.append((i, contigs_map_rev[int(contig)]))

        assembly_graph.add_edges(edge_list)
        assembly_graph.simplify(multiple=True, loops=False, combine_edges=None)

    except BaseException as err:
        logger.error(f"Unexpected {err}")
        logger.error(
            "Please make sure that the correct path to the assembly graph file is provided."
        )
        logger.info("Exiting GraphBin... Bye...!")
        sys.exit(1)

    logger.info(f"Total number of edges in the assembly graph: {len(edge_list)}")

    return assembly_graph, contigs_map, contig_names, node_count
def write_output(
    output_path,
    prefix,
    final_bins,
    contigs_file,
    contig_names_rev,
    bins,
    contig_names,
    bins_list,
    delimiter,
    node_count,
    remove_labels,
    non_isolated,
):
    """Write the final binning result of a metaSPAdes assembly to disk.

    Produces one FASTA file per bin under ``<output_path><prefix>bins/``,
    a ``graphbin_output.csv`` mapping contig names to bin labels, and
    (when any contigs remain unbinned) a ``graphbin_unbinned.csv``.
    """
    logger.info("Writing the Final Binning result to file")

    output_bins = []

    output_bins_path = f"{output_path}{prefix}bins/"
    output_file = f"{output_path}{prefix}graphbin_output.csv"

    # Fixed: create the directory with os.makedirs instead of shelling out
    # to `mkdir -p` via subprocess with shell=True, which was non-portable
    # and injectable through output_path/prefix.
    os.makedirs(output_bins_path, exist_ok=True)

    bin_files = {}

    for bin_name in set(final_bins.values()):
        bin_files[bin_name] = open(
            f"{output_bins_path}{prefix}bin_{bin_name}.fasta", "w+"
        )

    # Route every binned contig's sequence to its bin's FASTA file.
    for label, seq in MinimalFastaParser(contigs_file):
        contig_num = contig_names_rev[label]

        if contig_num in final_bins:
            bin_files[final_bins[contig_num]].write(f">{label}\n{seq}\n")

    # Close output files
    for c in set(final_bins.values()):
        bin_files[c].close()

    for b in range(len(bins)):
        for contig in bins[b]:
            output_bins.append([contig_names[contig], bins_list[b]])

    with open(output_file, mode="w") as out_file:
        output_writer = csv.writer(
            out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        for row in output_bins:
            output_writer.writerow(row)

    logger.info(f"Final binning results can be found in {output_bins_path}")

    # Contigs removed from the labelling or not in the non-isolated set
    # stay unbinned.
    unbinned_contigs = []

    for i in range(node_count):
        if i in remove_labels or i not in non_isolated:
            unbinned_contigs.append([contig_names[i]])

    if len(unbinned_contigs) != 0:
        unbinned_file = f"{output_path}{prefix}graphbin_unbinned.csv"

        with open(unbinned_file, mode="w") as out_file:
            output_writer = csv.writer(
                out_file, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL
            )

            for row in unbinned_contigs:
                output_writer.writerow(row)

        logger.info(f"Unbinned contigs can be found at {unbinned_file}")
def exec_wrong_command(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """Run a shell command that is expected to fail.

    Returns the command's decoded stdout when it exits with status 1
    (GraphBin's error exit code); otherwise returns None.
    """
    completed = subprocess.run(cmnd, shell=True, stdout=stdout, stderr=stderr)
    if completed.returncode == 1 and completed.stdout is not None:
        return completed.stdout.decode("utf8")
    return None
+ + +def test_graphbin_graph(tmp_dir): + """test graphbin on wrong graph file""" + dir_name = TEST_ROOTDIR / "data" / "ESC_metaSPAdes" + graph = dir_name / "assembly_graph_with_scaffold.gfa" + contigs = dir_name / "contigs.fasta" + paths = dir_name / "contigs.paths" + binned = dir_name / "initial_binning_res.csv" + cmd = f"graphbin --assembler spades --graph {graph} --contigs {contigs} --paths {paths} --binned {binned} --output {tmp_dir}" + exec_wrong_command(cmd) + + +def test_graphbin_contigs(tmp_dir): + """test graphbin on wrong contigs file""" + dir_name = TEST_ROOTDIR / "data" / "ESC_metaSPAdes" + graph = dir_name / "assembly_graph_with_scaffolds.gfa" + contigs = dir_name / "contig.fasta" + paths = dir_name / "contigs.paths" + binned = dir_name / "initial_binning_res.csv" + cmd = f"graphbin --assembler spades --graph {graph} --contigs {contigs} --paths {paths} --binned {binned} --output {tmp_dir}" + exec_wrong_command(cmd) + + +def test_graphbin_paths(tmp_dir): + """test graphbin on wrong paths file""" + dir_name = TEST_ROOTDIR / "data" / "ESC_metaSPAdes" + graph = dir_name / "assembly_graph_with_scaffolds.gfa" + contigs = dir_name / "contigs.fasta" + paths = dir_name / "contig.paths" + binned = dir_name / "initial_binning_res.csv" + cmd = f"graphbin --assembler spades --graph {graph} --contigs {contigs} --paths {paths} --binned {binned} --output {tmp_dir}" + exec_wrong_command(cmd) + + +def test_graphbin_binned(tmp_dir): + """test graphbin on wrong binned result file""" + dir_name = TEST_ROOTDIR / "data" / "ESC_metaSPAdes" + graph = dir_name / "assembly_graph_with_scaffolds.gfa" + contigs = dir_name / "contigs.fasta" + paths = dir_name / "contigs.paths" + binned = dir_name / "initial_binning.csv" + cmd = f"graphbin --assembler spades --graph {graph} --contigs {contigs} --paths {paths} --binned {binned} --output {tmp_dir}" + exec_wrong_command(cmd) + + +def test_graphbin_spades_path(tmp_dir): + """test graphbin spades without paths""" + dir_name = TEST_ROOTDIR 
/ "data" / "ESC_metaSPAdes" + graph = dir_name / "assembly_graph_with_scaffolds.gfa" + contigs = dir_name / "contigs.fasta" + binned = dir_name / "initial_binning_res.csv" + cmd = f"graphbin --assembler spades --graph {graph} --contigs {contigs} --binned {binned} --output {tmp_dir}" + exec_wrong_command(cmd) + + +def test_graphbin_flye_path(tmp_dir): + """test graphbin on flye assembly""" + dir_name = TEST_ROOTDIR / "data" / "1Y3B_Flye" + graph = dir_name / "assembly_graph.gfa" + contigs = dir_name / "assembly.fasta" + binned = dir_name / "initial_binning_res.csv" + cmd = f"graphbin --assembler flye --graph {graph} --contigs {contigs} --binned {binned} --output {tmp_dir}" + exec_wrong_command(cmd) + + +def test_graphbin_no_contigs(tmp_dir): + """test graphbin with no contigs""" + dir_name = TEST_ROOTDIR / "data" / "ESC_metaSPAdes" + graph = dir_name / "assembly_graph_with_scaffolds.gfa" + paths = dir_name / "contigs.paths" + binned = dir_name / "initial_binning_res.csv" + cmd = f"graphbin --assembler spades --graph {graph} --paths {paths} --binned {binned} --output {tmp_dir}" + exec_wrong_command(cmd) + + +def test_graphbin_prefix(tmp_dir): + """test graphbin with prefix""" + dir_name = TEST_ROOTDIR / "data" / "ESC_metaSPAdes" + graph = dir_name / "assembly_graph_with_scaffolds.gfa" + contigs = dir_name / "contigs.fasta" + paths = dir_name / "contigs.paths" + binned = dir_name / "initial_binning_res.csv" + cmd = f"graphbin --assembler spades --graph {graph} --contigs {contigs} --paths {paths} --binned {binned} --output {tmp_dir} --prefix test" + exec_wrong_command(cmd) + + +def test_graphbin_delimiter(tmp_dir): + """test graphbin with wrong delimiter""" + dir_name = TEST_ROOTDIR / "data" / "ESC_metaSPAdes" + graph = dir_name / "assembly_graph_with_scaffolds.gfa" + contigs = dir_name / "contigs.fasta" + paths = dir_name / "contigs.paths" + binned = dir_name / "initial_binning_res.csv" + delimiter = "." 
+ cmd = f"graphbin --assembler spades --graph {graph} --contigs {contigs} --paths {paths} --binned {binned} --output {tmp_dir} --delimiter {delimiter}" + exec_wrong_command(cmd) + + +def test_graphbin_max_iteration(tmp_dir): + """test graphbin with wrong max_iteration""" + dir_name = TEST_ROOTDIR / "data" / "ESC_metaSPAdes" + graph = dir_name / "assembly_graph_with_scaffolds.gfa" + contigs = dir_name / "contigs.fasta" + paths = dir_name / "contigs.paths" + binned = dir_name / "initial_binning_res.csv" + max_iteration = -10 + cmd = f"graphbin --assembler spades --graph {graph} --contigs {contigs} --paths {paths} --binned {binned} --output {tmp_dir} --max_iteration {max_iteration}" + exec_wrong_command(cmd) + + +def test_graphbin_diff_threshold(tmp_dir): + """test graphbin with wrong diff_threshold""" + dir_name = TEST_ROOTDIR / "data" / "ESC_metaSPAdes" + graph = dir_name / "assembly_graph_with_scaffolds.gfa" + contigs = dir_name / "contigs.fasta" + paths = dir_name / "contigs.paths" + binned = dir_name / "initial_binning_res.csv" + diff_threshold = -10 + cmd = f"graphbin --assembler spades --graph {graph} --contigs {contigs} --paths {paths} --binned {binned} --output {tmp_dir} --diff_threshold {diff_threshold}" + exec_wrong_command(cmd) diff --git a/tests/test_bidirectional_map.py b/tests/test_bidirectional_map.py new file mode 100644 index 0000000..2967574 --- /dev/null +++ b/tests/test_bidirectional_map.py @@ -0,0 +1,36 @@ +import subprocess + +from pathlib import Path + +import pytest + +from graphbin.utils.bidirectionalmap.bidirectionalmap import * + + +__author__ = "Vijini Mallawaarachchi" +__copyright__ = "Copyright 2019-2022, GraphBin Project" +__credits__ = ["Vijini Mallawaarachchi", "Gavin Huttley"] +__license__ = "BSD-3" +__version__ = "1.7.1" +__maintainer__ = "Vijini Mallawaarachchi" +__email__ = "vijini.mallawaarachchi@anu.edu.au" +__status__ = "Development" + + +def test_bidirectional_map_insert(): + try: + my_map = BidirectionalMap() + + my_map[0] = 
56 + my_map[7] = 56 + + except BidirectionalError: + print("BidirectionalError") + + +def test_bidirectional_map_delete(): + my_map = BidirectionalMap() + + my_map[0] = 56 + my_map[2] = 57 + my_map._del_item(2) diff --git a/tests/test_graphbin.py b/tests/test_graphbin.py index 0f06647..ed76342 100644 --- a/tests/test_graphbin.py +++ b/tests/test_graphbin.py @@ -1,13 +1,15 @@ import subprocess + from pathlib import Path import pytest + __author__ = "Vijini Mallawaarachchi" __copyright__ = "Copyright 2019-2022, GraphBin Project" __credits__ = ["Vijini Mallawaarachchi", "Gavin Huttley"] __license__ = "BSD-3" -__version__ = "1.6.0" +__version__ = "1.7.1" __maintainer__ = "Vijini Mallawaarachchi" __email__ = "vijini.mallawaarachchi@anu.edu.au" __status__ = "Development" @@ -47,6 +49,12 @@ def exec_command(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE): return out.decode("utf8") if out is not None else None +def test_graphbin_version(): + """test graphbin version""" + cmd = "graphbin --version" + exec_command(cmd) + + def test_graphbin_on_spades_dataset(tmp_dir): """test graphbin on spades assembly""" dir_name = TEST_ROOTDIR / "data" / "ESC_metaSPAdes"