Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ELENAPINEIRO committed Mar 17, 2023
0 parents commit acfaaf6
Show file tree
Hide file tree
Showing 52 changed files with 19,352 additions and 0 deletions.
46 changes: 46 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# PanDrugsDB
[Pandrugs](https://pandrugs.org/) is a bioinformatics platform to prioritize anticancer drug treatments according to individual genomic data. Its database version 2.0 integrates data from 23 primary sources.

The code to generate its content and the annotations for VCF files is available in this github repository.

This workflow summarizes PanDrugsDB composition and the steps implemented for its construction.

![workflow](images/pandrugsdb-sources.png)

In this table are described the sources included in the database and in the annotation process along with versions and license.

| Source | Version/Access Date*/DOI | License | Retrieved Information |
| ------------------ | ---------------------------- | --------------------------- | --------------------------------------------------------------- |
| CIViC | v2022-07-01 | CC0 1.0 | Drug-gene associations; Drug-gene annotations |
| DGIdb | v4.2.0 | MIT | Drug-gene associations |
| DrugBank | v5.1.9 | CC BY-NC 4.0 | Drug-gene associations |
| GDSC | v8.2 | CC BY-NC-ND 2.5 | Drug-gene associations; Drug-gene annotations |
| MOAlmanac | v2022-03-03 | GPLv2 | Drug-gene associations; Drug-gene annotations |
| OncoKB | v3.14 | Free for academic use | Drug-gene associations; Drug-gene annotations; Gene annotations |
| Thera-SAbDab | 2022-06-21 | CC BY 4.0 | Drug-gene associations |
| PubChem | 2022-11-29 | Free | Drug standardisation |
| HGNC | v2022-10-01 | CC0 1.0 | Gene standardisation |
| ClinicalTrials.gov | 2022-03-15 | Free | Drug annotations |
| CLUE | v1.2; build 1.44 | CC BY 4.0 | Drug annotations |
| Drugs@FDA | 2022-03-14 | Free | Drug annotations |
| EMA | 2022-05-31 | Free | Drug annotations |
| FDA Drug Labels | 2022-06-01 | CC0 1.0 | Drug annotations |
| KEGG BRITE | v101 | Free for academic use | Drug annotations |
| COSMIC's CGC | v95 | Free for non-commercial use | Gene annotations |
| DepMap | v22Q2 | CC BY 4.0 | Gene annotations; GScore calculation |
| KEGG PATHWAY | v103 | Free for academic use | Gene annotations; Variant annotations |
| OncoVar | v1.2 | Free for non-commercial use | Gene annotations; GScore calculation |
| Cancer Hallmarks | 10.1038/s41598-018-25076-6 | CC BY 4.0 | GScore calculation |
| TDLs | 10.1016/j.celrep.2022.110400 | CC BY-NC-ND 4.0 | GScore calculation |
| ClinVar | v2022-05 | Free | Variant annotations |
| COSMIC | v96 | Free for non-commercial use | Variant annotations |
| Domains | 10.1371/journal.pcbi.1004147 | CC BY 4.0 | Variant annotations |
| InterPro | v88.0 | CC0 1.0 | Variant annotations |
| Pfam | v35.0 | CC0 1.0 | Variant annotations |
| UniProt | v2022_01 | CC BY 4.0 | Variant annotations |
| VEP | v109 | Apache-2.0 | Variant annotations |

* Dates are displayed in ISO 8601 standard format: YYYY-MM-DD.

## Synthethic letal pairs
SL pairs are generated separately from the database and then merged back to the databse. The code responsible for generating and updating these pairs is stored [here](https://github.com/cnio-bu/pandrugs_sl_pairs).
12 changes: 12 additions & 0 deletions environment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
channels:
- bu_cnio
- conda-forge
- bioconda
- defaults
dependencies:
- perl-archive-extract =0.88
- perl-dbm-deep =2.0016
- perl-exporter-tiny =1.002002
- perl-list-moreutils =0.430
- perl-parallel-forkmanager =2.02
Binary file added images/pandrugsdb-sources.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
276 changes: 276 additions & 0 deletions src/VEP_parser_DB_v20.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
##!/usr/bin/perl
use lib "modules/";
use strict;
use warnings;
use Time::HiRes;
use Switch;
use Cwd;
use File::Path;
use Net::FTP;
use Archive::Extract ();
use POSIX ();
use LWP::Simple;
use File::Copy;
use DBM::Deep;
use Parallel::ForkManager;

# Variable Initialization
my $dbdir;
my $genesids = "custom";
my $pathways = "pathway_desc.tsv";
my $logfile = "";

my ($start, $end, $time);

#Command line arguments handle
if (!@ARGV || grep (/^((\-\-help)|(\-h))$/,@ARGV)) {
&help_info;
}

for my $a (0..$#ARGV){

switch ($ARGV[$a]){

# databases path
case /^((\-\-databases=)|(\-d=))/ {
$ARGV[$a] =~ /\-(\-databases|d)=(.+)/;
$dbdir = $2 ? $2 : die "\nEmpty argument. Please enter the parameter information.\n\neg. -d=/home/epineiro/Programs/PCDA/databases\n\n";
$dbdir = $dbdir . "/";
}

else {
die "\nArgument $ARGV[$a] not valid.\n\n";
}

}

}

if (!$dbdir) {
die "\nPath to databases not indicated. Please, enter the databases path.\n\neg. -d=databases\n\n";
}

# Create folders
#mkpath($dbdir, 0);

&create_dbs;

# Start time counter
$start = Time::HiRes::gettimeofday();

$end = Time::HiRes::gettimeofday();
$time = sprintf("%.2f", $end - $start);
printl ("\nTotal time: $time seconds\n");

exit;

sub create_dbs {

# Load files into variables
print "\n\nLoading database files...\n\n";

my (%pfam_a, %interpro_a, %last_domain, %cancer_domain);

my @cosmic_files = glob("$dbdir/cosmic*.tsv");

foreach (@cosmic_files) {
my $file = $_;
$_ =~ s/.tsv/.db/;
my $cosmic_list = DBM::Deep->new($_);
print("$file\n");
open (FILE, "<$file") or die "Couldn't open file: $!";
while (<FILE>){
chomp $_;
if ($. % 100000 == 0) {print("$.\n")};
my @line = split ("\t", $_);
$cosmic_list->{$line[0]} = [$line[1], $line[2], "$line[3] / $line[5]", "$line[4] / $line[5]"];
}
close FILE;
}

my $cosmic_gene_freq = DBM::Deep->new("$dbdir/cosmic_gene_freq.db");
my $cosmic_gf_file = "$dbdir/cosmic_gene_freq.tsv";
open (FILE, "<$cosmic_gf_file") or die "Couldn't open file: $!";
while (<FILE>){
chomp $_;
my @line = split ("\t", $_);
$cosmic_gene_freq -> {$line[0]} = [$line[1], $line[2]];
}
close FILE;

print "COSMIC loaded!\n";

my $genes_ids = DBM::Deep->new("$dbdir/genesids.db");
my $genes_ids_file = $dbdir . $genesids;
open (FILE, "<$genes_ids_file") or die "Couldn't open file: $!";
while (<FILE>){
chomp $_;
my @line = split ("\t", $_);
$genes_ids -> {$line[0]} = $line[1] if ($line[1]);
}
close FILE;

print "genes IDs loaded!\n";

my $kegg_gene_pathway_DB = DBM::Deep->new("$dbdir/gene_pathway.db");
my $gene_pathway_file = $dbdir . "gene_pathway.tsv";
open FILE, "<$gene_pathway_file" or die "Couldn't open file: $!";
while (<FILE>){
chomp $_;
my @line = split("\t", $_);
$kegg_gene_pathway_DB -> {$line[0]} = $line[1];
}
close FILE;

print "gene-pathway loaded!\n";

my $pathw_desc = DBM::Deep->new("$dbdir/pathways_desc.db");
my $pathway_desc = $dbdir . $pathways;
open FILE, "<$pathway_desc" or die "Couldn't open file: $pathway_desc $!";
while (<FILE>){
chomp $_;
my @line = split("\t", $_);
$pathw_desc -> {$line[0]} = $line[1];
}
close FILE;

print "pathway description loaded!\n";

my $pfam_a = DBM::Deep->new("$dbdir/pfam.db");
my @pfam_file = glob("$dbdir/Pfam-A.full.tsv");
open FILE, "<$pfam_file[0]" or die "Couldn't open file: $!";
while (<FILE>){
chomp ($_);
my @line = split("\t", $_);
if (exists($pfam_a{$line[4]})) {
push @{$pfam_a{$line[4]}}, [$line[1], $line[2], $line[5], $line[6]];
} else {
@{$pfam_a->{$line[4]}} = [$line[1], $line[2], $line[5], $line[6]];
}
}
close FILE;

print "pfam loaded!\n";

my $uniprot_b = DBM::Deep->new("$dbdir/uniprot_b.db");
my @uniprot_file = "$dbdir/Uniprot.tsv";
open FILE, "<$uniprot_file[0]" or die "Couldn't open file: $!";
while (<FILE>) {
chomp ($_);
my @line = split("\t", $_);
my $name = $1 if ($line[0] =~ /^([A-Z0-9]+)/);
$uniprot_b->{$line[1]} = $name if ($line[1] ne "");
}
close FILE;

print "uniprot loaded!\n";

my $interpro_a = DBM::Deep->new("$dbdir/interpro_a.db");
my @interpro_file = "$dbdir/Interpro.tsv";
my $last_domain = DBM::Deep->new("$dbdir/last_domain.db");
open FILE, "<$interpro_file[0]" or die "Couldn't open file: $!";
while (<FILE>) {
chomp ($_);
my @line = split ("\t",$_);
if (exists($interpro_a{$line[3]})) {
push @{$interpro_a{$line[3]}}, [$line[0], $line[1], $line[4], $line[5]];
} else {
@{$interpro_a->{$line[3]}} = [$line[0], $line[1], $line[4], $line[5]];
}
if (exists($last_domain{$line[3]})) {
if ($last_domain{$line[3]} < $line[4]) {
$last_domain{$line[3]} = $line[4];
}
} else {
$last_domain->{$line[3]} = $line[4];
}
}
close FILE;

print "interpro loaded!\n";

my $oncorole = DBM::Deep->new("$dbdir/generole.db");
open FILE, "<$dbdir/generole.tsv" or die "Couldn't open file: $!";
my %pos;
while (<FILE>) {
chomp ($_);
my @line = split ("\t",$_);
if ($_ =~ /^gene/) {
for my $i (0..$#line) {
$pos{$i} = $line[$i];
}
} else {
my @roles = @line[1..$#line];
my @role_list;
for my $i (0 .. $#roles) {
my $role = $roles[$i];
if ($role ne "") {
push(@role_list,"$pos{$i+1}:$role");
}
}
$oncorole->{$line[0]} = join ("; ", @role_list);
}
}
close FILE;
print "Gene Role loaded!\n";

my $gscore = DBM::Deep->new("$dbdir/gscore.db");
open (ESSEN, "$dbdir/gscore_Ene_2023.tsv");
while (<ESSEN>) {
chomp $_;
my @line = split ("\t", $_);
unless ($line[0] eq "checked_gene_symbol") {
$gscore->{$line[0]} = $line[1] ;
}
}
close ESSEN;
print "gscores loaded!\n";

my $cancer_domain = DBM::Deep->new("$dbdir/cancer_domain.db");
open DOM, "<$dbdir/domains.tsv" or die "Couldn't open file: $!";
while (<DOM>){
chomp ($_);
my @line = split ("\t",$_);
unless (exists($cancer_domain{$line[4]})) {

$cancer_domain->{$line[4]} = "";
}
}
close DOM;

print "cancer domains loaded!\n";

my $clinvar = DBM::Deep->new("$dbdir/clinvar.db");
my @clinvar_file = "$dbdir/Clinvar.tsv";
open CLINVAR, "<$clinvar_file[0]" or die "Couldn't open file: $!";
while (<CLINVAR>) {
chomp $_;
my @line = split ("\t", $_);
if ($line[1] eq "GRCh38") {
if (exists($clinvar->{"$line[2]:$line[3]:$line[4]:$line[5]"})) {
@{$clinvar->{"$line[2]:$line[3]:$line[4]:$line[5]"}}[0] .= "; $line[7]";
@{$clinvar->{"$line[2]:$line[3]:$line[4]:$line[5]"}}[1] .= "; $line[0]";
@{$clinvar->{"$line[2]:$line[3]:$line[4]:$line[5]"}}[2] .= "; $line[8]";
} else {
$clinvar->{"$line[2]:$line[3]:$line[4]:$line[5]"} = [$line[7], $line[0], $line[8]];
}
}
}
close CLINVAR;

print "clinvar loaded!\n";
}

sub printl {
$logfile = $logfile . $_[0];
print $_[0];
}

sub help_info {

print "--databases=directory or -d=directory \t\t\t Absolute path to databases directory. Mandatory.\n\n";

print "\ni.e. VEP_parser.pl -d=/home/epineiro/Programs/PCDA/databases\n\n";
exit;

}
Loading

0 comments on commit acfaaf6

Please sign in to comment.