diff --git a/README.md b/README.md
new file mode 100644
index 0000000..aac9ff7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,46 @@
+# PanDrugsDB
+[PanDrugs](https://pandrugs.org/) is a bioinformatics platform that prioritizes anticancer drug treatments according to individual genomic data. Version 2.0 of its database integrates data from 23 primary sources.
+
+The code used to generate its content and to annotate VCF files is available in this GitHub repository.
+
+The following workflow summarizes the composition of PanDrugsDB and the steps implemented for its construction.
+
+![workflow](images/pandrugsdb-sources.png)
+
+The table below describes the sources included in the database and in the annotation process, along with their versions and licenses.
+
+| Source | Version/Access Date*/DOI | License | Retrieved Information |
+| ------------------ | ---------------------------- | --------------------------- | ---------------------------------------------------------------- |
+| CIViC | v2022-07-01 | CC0 1.0 | Drug-gene associations; Drug-gene annotations |
+| DGIdb | v4.2.0 | MIT | Drug-gene associations |
+| DrugBank | v5.1.9 | CC BY-NC 4.0 | Drug-gene associations |
+| GDSC | v8.2 | CC BY-NC-ND 2.5 | Drug-gene associations; Drug-gene annotations |
+| MOAlmanac | v2022-03-03 | GPLv2 | Drug-gene associations; Drug-gene annotations |
+| OncoKB | v3.14 | Free for academic use | Drug-gene associations; Drug-gene annotations; Gene annotations |
+| Thera-SAbDab | 2022-06-21 | CC BY 4.0 | Drug-gene associations |
+| PubChem | 2022-11-29 | Free | Drug standardisation |
+| HGNC | v2022-10-01 | CC0 1.0 | Gene standardisation |
+| ClinicalTrials.gov | 2022-03-15 | Free | Drug annotations |
+| CLUE | v1.2; build 1.44 | CC BY 4.0 | Drug annotations |
+| Drugs@FDA | 2022-03-14 | Free | Drug annotations |
+| EMA | 2022-05-31 | Free | Drug annotations |
+| FDA Drug Labels | 2022-06-01 | CC0 1.0 | Drug annotations |
+| KEGG BRITE | v101 | Free for academic use | Drug annotations |
+| COSMIC's CGC | v95 | Free for non-commercial use | Gene annotations |
+| DepMap | v22Q2 | CC BY 4.0 | Gene annotations; GScore calculation |
+| KEGG PATHWAY | v103 | Free for academic use | Gene annotations; Variant annotations |
+| OncoVar | v1.2 | Free for non-commercial use | Gene annotations; GScore calculation |
+| Cancer Hallmarks | 10.1038/s41598-018-25076-6 | CC BY 4.0 | GScore calculation |
+| TDLs | 10.1016/j.celrep.2022.110400 | CC BY-NC-ND 4.0 | GScore calculation |
+| ClinVar | v2022-05 | Free | Variant annotations |
+| COSMIC | v96 | Free for non-commercial use | Variant annotations |
+| Domains | 10.1371/journal.pcbi.1004147 | CC BY 4.0 | Variant annotations |
+| InterPro | v88.0 | CC0 1.0 | Variant annotations |
+| Pfam | v35.0 | CC0 1.0 | Variant annotations |
+| UniProt | v2022_01 | CC BY 4.0 | Variant annotations |
+| VEP | v109 | Apache-2.0 | Variant annotations |
+
+\* Dates are displayed in the ISO 8601 standard format: YYYY-MM-DD.
+
+## Synthetic lethal pairs
+Synthetic lethal (SL) pairs are generated separately and then merged back into the database. The code responsible for generating and updating these pairs is stored [here](https://github.com/cnio-bu/pandrugs_sl_pairs).
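+
+## Setting up the environment
+The Perl dependencies used by the VEP parser scripts are pinned in `environment.yaml`. Below is a minimal sketch of how to set them up and run the annotation scripts; it assumes conda is installed, uses an arbitrary environment name, and a hypothetical input file `sample.vep.vcf.gz` (the parser reads its input through `gunzip -c`, so the VEP output is expected gzip-compressed). The flags are the ones documented in each script's help text.
+
+```bash
+conda env create -f environment.yaml -n pandrugsdb
+conda activate pandrugsdb
+
+# Build the local annotation databases (DBM::Deep files) from the source .tsv tables
+perl src/VEP_parser_DB_v20.pl -d=databases
+
+# Annotate a VEP-annotated VCF using those databases
+perl src/VEP_parser_v20_PD.pl -f=sample.vep.vcf.gz -d=databases -o=output --forkp=8
+```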
diff --git a/environment.yaml b/environment.yaml
new file mode 100644
index 0000000..91ae824
--- /dev/null
+++ b/environment.yaml
@@ -0,0 +1,12 @@
+---
+channels:
+  - bu_cnio
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - perl-archive-extract =0.88
+  - perl-dbm-deep =2.0016
+  - perl-exporter-tiny =1.002002
+  - perl-list-moreutils =0.430
+  - perl-parallel-forkmanager =2.02
diff --git a/images/pandrugsdb-sources.png b/images/pandrugsdb-sources.png
new file mode 100644
index 0000000..9937ab0
Binary files /dev/null and b/images/pandrugsdb-sources.png differ
diff --git a/src/VEP_parser_DB_v20.pl b/src/VEP_parser_DB_v20.pl
new file mode 100644
index 0000000..9391667
--- /dev/null
+++ b/src/VEP_parser_DB_v20.pl
@@ -0,0 +1,276 @@
+#!/usr/bin/perl
+use lib "modules/";
+use strict;
+use warnings;
+use Time::HiRes;
+use Switch;
+use Cwd;
+use File::Path;
+use Net::FTP;
+use Archive::Extract ();
+use POSIX ();
+use LWP::Simple;
+use File::Copy;
+use DBM::Deep;
+use Parallel::ForkManager;
+
+# Variable initialization
+my $dbdir;
+my $genesids = "custom";
+my $pathways = "pathway_desc.tsv";
+my $logfile = "";
+
+my ($start, $end, $time);
+
+# Command-line argument handling
+if (!@ARGV || grep (/^((\-\-help)|(\-h))$/,@ARGV)) {
+    &help_info;
+}
+
+for my $a (0..$#ARGV){
+
+    switch ($ARGV[$a]){
+
+        # databases path
+        case /^((\-\-databases=)|(\-d=))/ {
+            $ARGV[$a] =~ /\-(\-databases|d)=(.+)/;
+            $dbdir = $2 ? $2 : die "\nEmpty argument. Please enter the parameter information.\n\neg. -d=/home/epineiro/Programs/PCDA/databases\n\n";
+            $dbdir = $dbdir . "/";
+        }
+
+        else {
+            die "\nArgument $ARGV[$a] not valid.\n\n";
+        }
+
+    }
+
+}
+
+if (!$dbdir) {
+    die "\nPath to databases not indicated. Please, enter the databases path.\n\neg. -d=databases\n\n";
+}
+
+# Create folders
+#mkpath($dbdir, 0);
+
+# Start time counter
+$start = Time::HiRes::gettimeofday();
+
+&create_dbs;
+
+$end = Time::HiRes::gettimeofday();
+$time = sprintf("%.2f", $end - $start);
+printl ("\nTotal time: $time seconds\n");
+
+exit;
+
+sub create_dbs {
+
+    # Load files into variables
+    print "\n\nLoading database files...\n\n";
+
+    my @cosmic_files = glob("$dbdir/cosmic*.tsv");
+
+    foreach (@cosmic_files) {
+        my $file = $_;
+        $_ =~ s/.tsv/.db/;
+        my $cosmic_list = DBM::Deep->new($_);
+        print("$file\n");
+        open (FILE, "<$file") or die "Couldn't open file: $!";
+        while (<FILE>) {
+            chomp $_;
+            if ($. % 100000 == 0) {print("$.\n")};
+            my @line = split ("\t", $_);
+            $cosmic_list->{$line[0]} = [$line[1], $line[2], "$line[3] / $line[5]", "$line[4] / $line[5]"];
+        }
+        close FILE;
+    }
+
+    my $cosmic_gene_freq = DBM::Deep->new("$dbdir/cosmic_gene_freq.db");
+    my $cosmic_gf_file = "$dbdir/cosmic_gene_freq.tsv";
+    open (FILE, "<$cosmic_gf_file") or die "Couldn't open file: $!";
+    while (<FILE>) {
+        chomp $_;
+        my @line = split ("\t", $_);
+        $cosmic_gene_freq->{$line[0]} = [$line[1], $line[2]];
+    }
+    close FILE;
+
+    print "COSMIC loaded!\n";
+
+    my $genes_ids = DBM::Deep->new("$dbdir/genesids.db");
+    my $genes_ids_file = $dbdir . $genesids;
+    open (FILE, "<$genes_ids_file") or die "Couldn't open file: $!";
+    while (<FILE>) {
+        chomp $_;
+        my @line = split ("\t", $_);
+        $genes_ids->{$line[0]} = $line[1] if ($line[1]);
+    }
+    close FILE;
+
+    print "genes IDs loaded!\n";
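+
+    # Each source .tsv is materialised as an on-disk DBM::Deep store so the
+    # annotation script can look records up without holding every table in RAM.
+    # Key/value layouts built so far (as derived from the loops above):
+    #   cosmicNN.db         "gene:transcript:HGVSc" -> [COSMIC id, FATHMM prediction, gene freq "x / y", mutation freq "x / y"]
+    #   cosmic_gene_freq.db gene symbol -> [samples mutated, samples tested]
+    #   genesids.db         HGNC symbol -> gene id used for the KEGG lookups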
"gene_pathway.tsv"; + open FILE, "<$gene_pathway_file" or die "Couldn't open file: $!"; + while (){ + chomp $_; + my @line = split("\t", $_); + $kegg_gene_pathway_DB -> {$line[0]} = $line[1]; + } + close FILE; + + print "gene-pathway loaded!\n"; + + my $pathw_desc = DBM::Deep->new("$dbdir/pathways_desc.db"); + my $pathway_desc = $dbdir . $pathways; + open FILE, "<$pathway_desc" or die "Couldn't open file: $pathway_desc $!"; + while (){ + chomp $_; + my @line = split("\t", $_); + $pathw_desc -> {$line[0]} = $line[1]; + } + close FILE; + + print "pathway description loaded!\n"; + + my $pfam_a = DBM::Deep->new("$dbdir/pfam.db"); + my @pfam_file = glob("$dbdir/Pfam-A.full.tsv"); + open FILE, "<$pfam_file[0]" or die "Couldn't open file: $!"; + while (){ + chomp ($_); + my @line = split("\t", $_); + if (exists($pfam_a{$line[4]})) { + push @{$pfam_a{$line[4]}}, [$line[1], $line[2], $line[5], $line[6]]; + } else { + @{$pfam_a->{$line[4]}} = [$line[1], $line[2], $line[5], $line[6]]; + } + } + close FILE; + + print "pfam loaded!\n"; + + my $uniprot_b = DBM::Deep->new("$dbdir/uniprot_b.db"); + my @uniprot_file = "$dbdir/Uniprot.tsv"; + open FILE, "<$uniprot_file[0]" or die "Couldn't open file: $!"; + while () { + chomp ($_); + my @line = split("\t", $_); + my $name = $1 if ($line[0] =~ /^([A-Z0-9]+)/); + $uniprot_b->{$line[1]} = $name if ($line[1] ne ""); + } + close FILE; + + print "uniprot loaded!\n"; + + my $interpro_a = DBM::Deep->new("$dbdir/interpro_a.db"); + my @interpro_file = "$dbdir/Interpro.tsv"; + my $last_domain = DBM::Deep->new("$dbdir/last_domain.db"); + open FILE, "<$interpro_file[0]" or die "Couldn't open file: $!"; + while () { + chomp ($_); + my @line = split ("\t",$_); + if (exists($interpro_a{$line[3]})) { + push @{$interpro_a{$line[3]}}, [$line[0], $line[1], $line[4], $line[5]]; + } else { + @{$interpro_a->{$line[3]}} = [$line[0], $line[1], $line[4], $line[5]]; + } + if (exists($last_domain{$line[3]})) { + if ($last_domain{$line[3]} < $line[4]) { + $last_domain{$line[3]} = $line[4]; + } + } else { + $last_domain->{$line[3]} = $line[4]; + } + } + close FILE; + + print "interpro loaded!\n"; + + my $oncorole = DBM::Deep->new("$dbdir/generole.db"); + open FILE, "<$dbdir/generole.tsv" or die "Couldn't open file: $!"; + my %pos; + while () { + chomp ($_); + my @line = split ("\t",$_); + if ($_ =~ /^gene/) { + for my $i (0..$#line) { + $pos{$i} = $line[$i]; + } + } else { + my @roles = @line[1..$#line]; + my @role_list; + for my $i (0 .. 
+    my $oncorole = DBM::Deep->new("$dbdir/generole.db");
+    open FILE, "<$dbdir/generole.tsv" or die "Couldn't open file: $!";
+    my %pos;
+    while (<FILE>) {
+        chomp ($_);
+        my @line = split ("\t",$_);
+        if ($_ =~ /^gene/) {
+            for my $i (0..$#line) {
+                $pos{$i} = $line[$i];
+            }
+        } else {
+            my @roles = @line[1..$#line];
+            my @role_list;
+            for my $i (0 .. $#roles) {
+                my $role = $roles[$i];
+                if ($role ne "") {
+                    push(@role_list,"$pos{$i+1}:$role");
+                }
+            }
+            $oncorole->{$line[0]} = join ("; ", @role_list);
+        }
+    }
+    close FILE;
+    print "Gene Role loaded!\n";
+
+    my $gscore = DBM::Deep->new("$dbdir/gscore.db");
+    open (ESSEN, "<$dbdir/gscore_Ene_2023.tsv") or die "Couldn't open file: $!";
+    while (<ESSEN>) {
+        chomp $_;
+        my @line = split ("\t", $_);
+        unless ($line[0] eq "checked_gene_symbol") {
+            $gscore->{$line[0]} = $line[1];
+        }
+    }
+    close ESSEN;
+    print "gscores loaded!\n";
+
+    my $cancer_domain = DBM::Deep->new("$dbdir/cancer_domain.db");
+    open DOM, "<$dbdir/domains.tsv" or die "Couldn't open file: $!";
+    while (<DOM>) {
+        chomp ($_);
+        my @line = split ("\t",$_);
+        unless (exists($cancer_domain->{$line[4]})) {
+            $cancer_domain->{$line[4]} = "";
+        }
+    }
+    close DOM;
+
+    print "cancer domains loaded!\n";
+
+    my $clinvar = DBM::Deep->new("$dbdir/clinvar.db");
+    my $clinvar_file = "$dbdir/Clinvar.tsv";
+    open CLINVAR, "<$clinvar_file" or die "Couldn't open file: $!";
+    while (<CLINVAR>) {
+        chomp $_;
+        my @line = split ("\t", $_);
+        if ($line[1] eq "GRCh38") {
+            # Variants are keyed as chr:pos:ref:alt; repeated keys append values
+            if (exists($clinvar->{"$line[2]:$line[3]:$line[4]:$line[5]"})) {
+                @{$clinvar->{"$line[2]:$line[3]:$line[4]:$line[5]"}}[0] .= "; $line[7]";
+                @{$clinvar->{"$line[2]:$line[3]:$line[4]:$line[5]"}}[1] .= "; $line[0]";
+                @{$clinvar->{"$line[2]:$line[3]:$line[4]:$line[5]"}}[2] .= "; $line[8]";
+            } else {
+                $clinvar->{"$line[2]:$line[3]:$line[4]:$line[5]"} = [$line[7], $line[0], $line[8]];
+            }
+        }
+    }
+    close CLINVAR;
+
+    print "clinvar loaded!\n";
+}
+
+sub printl {
+    $logfile = $logfile . $_[0];
+    print $_[0];
+}
+
+sub help_info {
+
+    print "--databases=directory or -d=directory \t\t\t Absolute path to databases directory. Mandatory.\n\n";
+
+    print "\ne.g. VEP_parser_DB_v20.pl -d=/home/epineiro/Programs/PCDA/databases\n\n";
+    exit;
+
+}
diff --git a/src/VEP_parser_v20_PD.pl b/src/VEP_parser_v20_PD.pl
new file mode 100644
index 0000000..765a0cf
--- /dev/null
+++ b/src/VEP_parser_v20_PD.pl
@@ -0,0 +1,1037 @@
+#!/usr/bin/perl
+use lib "modules/";
+use strict;
+use warnings;
+use Time::HiRes;
+use Switch;
+use Cwd;
+use File::Path;
+use Net::FTP;
+use POSIX ();
+use LWP::Simple;
+use File::Copy;
+use DBM::Deep;
+use Parallel::ForkManager;
+use List::MoreUtils qw(uniq);
+
+# Variable Initialization
+my $outdir = "./output/";
+my $dbdir;
+my $outpath = getcwd;
+my (%cosmic_list0, %cosmic_list1, %cosmic_list2, %cosmic_list3, %cosmic_list4, %cosmic_list5, %cosmic_list6, %cosmic_list7, %cosmic_list8, %cosmic_list9, %cosmic_list10, %cosmic_list11, %cosmic_list12, %cosmic_list13, %cosmic_list14, %cosmic_list15, %cosmic_list16, %cosmic_list17, %cosmic_list18, %cosmic_list19, %cosmic_list20, %cosmic_list21, %cosmic_list22, %cosmic_list23, %cosmic_list24, %cosmic_list25, %cosmic_gene_freq);
+my ($cosmic_list0, $cosmic_list1, $cosmic_list2, $cosmic_list3, $cosmic_list4, $cosmic_list5, $cosmic_list6, $cosmic_list7, $cosmic_list8, $cosmic_list9, $cosmic_list10, $cosmic_list11, $cosmic_list12, $cosmic_list13, $cosmic_list14, $cosmic_list15, $cosmic_list16, $cosmic_list17, $cosmic_list18, $cosmic_list19, $cosmic_list20, $cosmic_list21, $cosmic_list22, $cosmic_list23, $cosmic_list24, $cosmic_list25, $cosmic_gene_freq);
+my $vep_results_file;
+my $logfile = "";
+my %pathw_desc;
+my $pathw_desc;
+my %genes_ids;
+my $genes_ids;
+my $root_name = "";
+my $jobid;
+my $genes_affected;
+
+my $conseq_file = 1;
+my $MAX_PROCESSES = 8;
+
+my %kegg_gene_pathway_DB;
+my $kegg_gene_pathway_DB;
+
+my ($start, $end, $time);
+my (%pfam_a, %uniprot_b, %interpro_a);
+my
($pfam_a, $uniprot_b, $interpro_a); + +my %zygosity; +my %last_domain; +my %gscore; +my %cancer_domain; +my %clinvar; +my %appris; + +my $generole; +my $last_domain; +my $gscore; +my $cancer_domain; +my $clinvar; +my $appris; + +#Command line arguments handle +if (!@ARGV || grep (/^((\-\-help)|(\-h))$/,@ARGV)) { + &help_info; +} + +for my $a (0..$#ARGV){ + + switch ($ARGV[$a]){ + + #Input file + case /^((\-\-vepfile=)|(\-f=))/ { + $ARGV[$a] =~ /\-(\-vepfile|f)=(.+)/; + $vep_results_file = $2 ? $2 : die "\nEmpty argument. Please enter the parameter information.\n\neg. -f=file.vcf\n\n"; + my @vep_results_file_tmp = glob ("$2"); + $vep_results_file = $vep_results_file_tmp[0]; + } + + # Output file + case /^((\-\-output=)|(\-o=))/ { + $ARGV[$a] =~ /\-(\-output|o)=(.+)/; + $outdir = $2 ? $2 : die "\nEmpty argument. Please enter the parameter information.\n\neg. -o=/home/user/z13-222\n\n"; + $outpath = ""; + } + + # Root for the file + case /^((\-\-root=)|(\-r=))/{ + $ARGV[$a] =~ /\-(\-root|r)=(.+)/; + if ($2) { + $root_name = $2 . "_"; + } + else { + die "\nEmpty argument. Please enter the parameter information.\n\neg. -r=analysis\n\n"; + } + } + + # JobID + case /^((\-\-int=)|(\-i=))/ { + $ARGV[$a] =~ /\-(\-int|i)=(.+)/; + $jobid = $2 ? $2 : die "\nEmpty argument. Please enter the parameter information.\n\neg. -i=20140213_000000\n\n"; + } + + # databases path + case /^((\-\-databases=)|(\-d=))/ { + $ARGV[$a] =~ /\-(\-databases|d)=(.+)/; + $dbdir = $2 ? $2 : die "\nEmpty argument. Please enter the parameter information.\n\neg. -d=/home/epineiro/Programs/PCDA/databases\n\n"; + $dbdir = $dbdir . "/"; + } + + # Fork processes + case /^(\-\-forkp=)/ { + $ARGV[$a] =~ /\-(\-forkp)=(.+)/; + $MAX_PROCESSES = $2 ? $2 : die "\nEmpty argument. Please enter the parameter information.\n\neg. --forkp=8\n\n"; + } + else { + die "\nArgument $ARGV[$a] not valid.\n\n"; + } + + } + +} + +if (!$vep_results_file) { + die "\nVEP file not indicated. Please enter the absolute path to VEP sorted output file.\n\neg. -f=z13-222\n\n"; +} +if (!$dbdir) { + die "\nPath to databases not indicated. Please, enter the databases path.\n\neg. -d=databases\n\n"; +} + +# Create folders +mkpath($dbdir, 0); + +# Start time counter +$start = Time::HiRes::gettimeofday(); + +# get new experiment id +if ($jobid) { + $jobid = $root_name . $jobid . "_VEP"; +} +else { + $jobid = &get_runid; +} +my $outexp = $outdir . "/" . $jobid . "/"; + +# Create experiment folder in output folder +mkpath($outexp); + +# Call VEP main subroutine +&VEP_Parser_Csv; + +$end = Time::HiRes::gettimeofday(); +$time = sprintf("%.2f", $end - $start); +printl ("\nTotal time: $time seconds\n"); + +# Log file creation +open LOGFILE, ">$outpath$jobid" . 
".log" or die $!; +print LOGFILE $logfile; +close LOGFILE; + +exit; + +sub load_vars2 { +#Load files into variables + + $cosmic_list0 = DBM::Deep->new("$dbdir/cosmic00.db"); + $cosmic_list1 = DBM::Deep->new("$dbdir/cosmic01.db"); + $cosmic_list2 = DBM::Deep->new("$dbdir/cosmic02.db"); + $cosmic_list3 = DBM::Deep->new("$dbdir/cosmic03.db"); + $cosmic_list4 = DBM::Deep->new("$dbdir/cosmic04.db"); + $cosmic_list5 = DBM::Deep->new("$dbdir/cosmic05.db"); + $cosmic_list6 = DBM::Deep->new("$dbdir/cosmic06.db"); + $cosmic_list7 = DBM::Deep->new("$dbdir/cosmic07.db"); + $cosmic_list8 = DBM::Deep->new("$dbdir/cosmic08.db"); + $cosmic_list9 = DBM::Deep->new("$dbdir/cosmic09.db"); + $cosmic_list10 = DBM::Deep->new("$dbdir/cosmic10.db"); + $cosmic_list11 = DBM::Deep->new("$dbdir/cosmic11.db"); + $cosmic_list12 = DBM::Deep->new("$dbdir/cosmic12.db"); + $cosmic_list13 = DBM::Deep->new("$dbdir/cosmic13.db"); + $cosmic_list14 = DBM::Deep->new("$dbdir/cosmic14.db"); + $cosmic_list15 = DBM::Deep->new("$dbdir/cosmic15.db"); + $cosmic_list16 = DBM::Deep->new("$dbdir/cosmic16.db"); + $cosmic_list17 = DBM::Deep->new("$dbdir/cosmic17.db"); + $cosmic_list18 = DBM::Deep->new("$dbdir/cosmic18.db"); + $cosmic_list19 = DBM::Deep->new("$dbdir/cosmic19.db"); + $cosmic_list20 = DBM::Deep->new("$dbdir/cosmic20.db"); + $cosmic_list21 = DBM::Deep->new("$dbdir/cosmic21.db"); + $cosmic_list22 = DBM::Deep->new("$dbdir/cosmic22.db"); + $cosmic_list23 = DBM::Deep->new("$dbdir/cosmic23.db"); + $cosmic_list24 = DBM::Deep->new("$dbdir/cosmic24.db"); + $cosmic_list25 = DBM::Deep->new("$dbdir/cosmic25.db"); + $cosmic_gene_freq = DBM::Deep->new("$dbdir/cosmic_gene_freq.db"); + + $genes_ids = DBM::Deep->new("$dbdir/genesids.db"); + + $kegg_gene_pathway_DB = DBM::Deep->new("$dbdir/gene_pathway.db"); + + $pathw_desc = DBM::Deep->new("$dbdir/pathways_desc.db"); + + $pfam_a = DBM::Deep->new("$dbdir/pfam.db"); + + $uniprot_b = DBM::Deep->new("$dbdir/uniprot_b.db"); + + $interpro_a = DBM::Deep->new("$dbdir/interpro_a.db"); + $last_domain = DBM::Deep->new("$dbdir/last_domain.db"); + + $gscore = DBM::Deep->new("$dbdir/gscore.db"); + + $cancer_domain = DBM::Deep->new("$dbdir/cancer_domain.db"); + + $clinvar = DBM::Deep->new("$dbdir/clinvar.db"); + + $generole = DBM::Deep->new("$dbdir/generole.db"); + +} + +sub help_info { + + print "\n\n--vepfile=filename or -f=filename \t\t Input file containing results of VEP from Ensembl analysis. Mandatory.\n\n"; + print "--output=directory or -o=directory \t\t Execution output dir. Default ./output.\n\n"; + print "--int=jobID or -i=jobID \t\t\t Job ID code (when executing from sequencingAP). Default: Generated during execution.\n\n"; + print "--databases=directory or -d=directory \t\t\t Absolute path to databases directory. Mandatory.\n\n"; + print "--forkp \t\t\t\t\t\t Number of forks to improve runtime. Default: 4.\n\n"; + + print "\ni.e. VEP_parser.pl -f=file.vcf -o=/home/user/z13-222 -r=analysis -i=20140213_000000 -d=/home/epineiro/Programs/PCDA/databases --forkp=8\n\n"; + exit; + +} + +sub get_runid { + + my $timestamp = POSIX::strftime("%Y%m%d_%H%M%S", localtime); + return $root_name . "$timestamp" . 
"_VEP"; + +} + +sub VEP_Parser_Csv($$) {#Require a DB_conection and source data file + + # Main process + # Variable inti + my $data = ''; + + # Open vcf file + open (RFILE, "gunzip -c $vep_results_file |") || die "Could not find file $vep_results_file"; + printl ("\nProcessing file $vep_results_file...\n"); + my @rfile = ; + close RFILE; + + my %pos; + + my $count = 0; + + foreach my $i (0..$#rfile) { + + # Remove lines with ## & # to be able to handle file as csv + $rfile[$i] =~ s/\r/\n/g; + + if ($rfile[$i] =~ /^#[^#]/) { + $rfile[$i] =~ s/#//; + $data = $data . $rfile[$i]; + } + elsif ($rfile[$i] =~ /^[^#]/) { + $data = $data . $rfile[$i]; + $count++; + } + elsif ($rfile[$i] =~ /^##INFO=/) { + my @vep_fields = split ('\|', $1); + foreach my $i (0..$#vep_fields) { + $pos{Consequence} = $i if ($vep_fields[$i] eq "Consequence"); + $pos{Impact} = $i if ($vep_fields[$i] eq "IMPACT"); + $pos{Existing_variation} = $i if ($vep_fields[$i] eq "Existing_variation"); + $pos{Feature} = $i if ($vep_fields[$i] eq "Feature"); + $pos{PolyPhen} = $i if ($vep_fields[$i] eq "PolyPhen"); + $pos{SIFT} = $i if ($vep_fields[$i] eq "SIFT"); + $pos{CADD_PHRED} = $i if ($vep_fields[$i] eq "CADD_PHRED"); + $pos{CADD_RAW} = $i if ($vep_fields[$i] eq "CADD_RAW"); + $pos{SYMBOL} = $i if ($vep_fields[$i] eq "SYMBOL"); + $pos{Protein_position} = $i if ($vep_fields[$i] eq "Protein_position"); + $pos{Amino_acids} = $i if ($vep_fields[$i] eq "Amino_acids"); + $pos{HGVSc} = $i if ($vep_fields[$i] eq "HGVSc"); + $pos{HGVSp} = $i if ($vep_fields[$i] eq "HGVSp"); + $pos{GMAF} = $i if ($vep_fields[$i] eq "AF"); + $pos{CDS_position} = $i if ($vep_fields[$i] eq "CDS_position"); + $pos{Allele} = $i if ($vep_fields[$i] eq "Allele"); + $pos{Gene} = $i if ($vep_fields[$i] eq "Gene"); + $pos{Feature_type} = $i if ($vep_fields[$i] eq "Feature_type"); + $pos{cDNA_position} = $i if ($vep_fields[$i] eq "cDNA_position"); + $pos{Codons} = $i if ($vep_fields[$i] eq "Codons"); + $pos{VARIANT_CLASS} = $i if ($vep_fields[$i] eq "VARIANT_CLASS"); + $pos{gnomADe} = $i if ($vep_fields[$i] eq "gnomADe_AF"); + $pos{gnomADe_NFE} = $i if ($vep_fields[$i] eq "gnomADe_NFE_AF"); + $pos{EXON} = $i if ($vep_fields[$i] eq "EXON"); + $pos{APPRIS} = $i if ($vep_fields[$i] eq "APPRIS"); + + } + + } else { + + } + + } + + # Save modifications to ensembl_vep.csv + my $outexp_path = $outexp; + $outexp_path =~ s/^\.//g; + + $outpath .= $outexp_path; + my $outpathfile = $outpath . 
"ensembl_vep.csv"; + + open (SFILE, ">$outpathfile") || die "Could not save temp file\n"; + print SFILE $data; + close SFILE; + + printl ("\nensembl_vep.csv file created!\n"); + + my $sth; + + my @last_gene = ("","","",""); + + print "\n\nCreating annotations for $count variants...\n"; + + open (INPUT, "$outexp/ensembl_vep.csv"); + my $ensemblhead = ""; + my $chr = ""; + my $filechr = ""; + my @chr_names = (); + while () { + if ($_ =~ /^CHROM/) { + $ensemblhead = $_; + } else { + my @line = split ("\t", $_); + if ($line[0] ne $chr) { + $chr = $line[0]; + push (@chr_names, $chr); + if ($filechr ne "") {close $filechr} + $filechr = "$outexp/ensembl_vep_$chr.csv"; + open (OUTPUTCHR, ">$filechr"); + print OUTPUTCHR $ensemblhead; + print OUTPUTCHR $_; + } else { + print OUTPUTCHR $_; + } + } + } + close INPUT; + + my $pm = Parallel::ForkManager->new($MAX_PROCESSES); + + DATA_LOOP: + foreach $chr(@chr_names) { + print "Processing chromosome $chr...\n"; + &load_vars2; + my $pid = $pm->start and next DATA_LOOP; + open (INPUT, "$outexp/ensembl_vep_$chr.csv"); + open (OUT, ">$outexp/vep_data_$chr.csv"); + open (OUTSORT, ">$outexp/vep_data_sorted_$chr.csv"); + + print OUT "Chr\tLoc\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\tmut\tlocation\tallele\tgene\tfeature\tfeature_type\tconsequence\timpact\tcdna_position\tcds_position\tprotein_position\tamino_acids\tcodons\texisting_variation\textra\tprincipal\tpoly_effect\tpoly_score\tsift_effect\tsift_score\tCADD_phred\tCADD_raw\tgene_hgnc\tgene_role\tcosmic_id\tKEGG_data\tKEGG_path_id\tclinvar_acc\tclinvar_disease\tclinvar_clinical_significance\tvariation_type\tHGVS_cDNA\tHGVS_protein\tGMAF\tgnomAD\tgnomAD_NFE\tpfam\tinterpro\tgene_cosmic_freq\tmut_cosmic_freq\tvscore\tbranch\tzygosity\n"; + + print OUTSORT "chr\tloc\tmut\tgene\tfeature\tfeature_type\tconsequence\timpact\tprincipal\tpoly_effect\tpoly_score\tsift_effect\tsift_score\tCADD_phred\tCADD_raw\tgene_hgnc\tgene_role\tcosmic_id\tkegg_data\tkegg_path_id\tprotein_position\tamino_acids\tclinvar_acc\tclinvar_disease\tclinvar_clinical_significance\tvariation_type\tHGVS_cDNA\tHGVS_protein\tGMAF\tgnomAD\tgnomAD_NFE\tpfam\tinterpro\tgene_cosmic_freq\tmut_cosmic_freq\tvscore\tbranch\tzygosity\n"; + + $count = 1; + while () { + unless ($_ =~ /^CHROM/) { + chomp $_; + my @line = split ("\t", $_); + + $line[0] =~ s/chr//; + $line[2] = "" if ($line[2] eq "."); + + my $VCF_pos = $line[1]; + my $VCF_ref = $line[3]; + my $VCF_alt = $line[4]; + + if ((length($line[3]) ne length($line[4])) && ($line[4] !~ /,/)) { + if (length($line[3]) > length($line[4])) { + #Puede haber una deleción interna que no se procesaría correctamente + if ($line[3] =~ /^$line[4]/) { + $line[3] =~ s/^$line[4]//; + $line[1] = $line[1] + length($line[4]); + $line[4] = "-"; + } + } else { + #Puede haber una inserción interna que no se procesaría correctamente (ej. TGCTCTACC/TATAGATCGGAAGCTCTACC) + if ($line[4] =~ /^$line[3]/) { + $line[4] =~ s/^$line[3]//; + $line[1] = $line[1] + length($line[3]); + $line[3] = "-"; + } + } + } + + $line[10] = $line[3] . "/" . $line[4]; + + $line[11] = $line[0] . ":" . 
$line[1]; + + if ($line[9]) { + my @gentype = split(":", $line[9]); + if ($gentype[0] eq "1/1") { + $zygosity{"$line[0]_$line[1]_$line[10]"} = "Homozygous"; + } + else { + $zygosity{"$line[0]_$line[1]_$line[10]"} = "Heterozygous"; + } + } else { + $zygosity{"$line[0]_$line[1]_$line[10]"} = ""; + } + + my @q_data; + + if ($line[7] =~ /CSQ=(.+)/) { + my @trans = split (",", $1); + foreach my $t (0..$#trans) { + my $transcript = ""; + my $pol_cons = ""; + my $pol_sco = ""; + my $sift_cons = ""; + my $sift_sco = ""; + my $CADD_phred = ""; + my $CADD_raw = ""; + my $cosmic_id = ""; + my $cosmic_fathmm = ""; + my $gene_freq = 0; + my $mut_freq = 0; + my $cosmic_total = ""; + my $kegg_data = ""; + my $kegg_ids = ""; + my $clinvar_acc = ""; + my $clinvar_dis = ""; + my $clinvar_pat = ""; + my $HGVSc = ""; + my $HGVSp = ""; + my $GMAF = ""; + my $uniprot = ""; + my $pfam = ""; + my $interpro = ""; + my $var_type = ""; + my $gnomAD = ""; + my $gnomAD_NFE = ""; + + my @fields = split ('\|', $trans[$t]); + + $fields[$pos{Consequence}] =~ s/&/,/g; + $fields[$pos{Existing_variation}] =~ s/&/,/g if ($fields[$pos{Existing_variation}]); + + if ($fields[$pos{HGVSp}] && $fields[$pos{HGVSp}] =~ /:(.+)/) { + $HGVSp = $1; + } + + if ($fields[$pos{HGVSc}] && $fields[$pos{HGVSc}] =~ /:(.+)/) { + $HGVSc = $1; + } + + if ($fields[$pos{PolyPhen}] && $fields[$pos{PolyPhen}] =~ /(\w+)\((\d+\.*\d*)\)/) { + $pol_cons = $1; + $pol_sco = $2; + } + +# elsif ($fields[$pos{Consequence}] =~ /stop_gained/ || $fields[$pos{Consequence}] =~ /frameshift_variant/) { +# $pol_cons = "inferred"; +# $pol_sco = 1; +# } + + if ($fields[$pos{SIFT}] && $fields[$pos{SIFT}] =~ /(\w+)\((\d+\.*\d*)\)/) { + $sift_cons = $1; + $sift_sco = $2; + } + +# elsif ($fields[$pos{Consequence}] =~ /stop_gained/ || $fields[$pos{Consequence}] =~ /frameshift_variant/) { +# $sift_cons = "inferred"; +# $sift_sco = 0; +# } + + $CADD_phred = $fields[$pos{CADD_PHRED}]; + $CADD_raw = $fields[$pos{CADD_RAW}]; + + if ($fields[$pos{HGVSc}]) { + ($cosmic_id, $cosmic_fathmm, $gene_freq, $mut_freq, $cosmic_total) = &chkmut_cosmic($fields[$pos{SYMBOL}], $fields[$pos{Feature}], $HGVSc); + $cosmic_id .= ":$cosmic_fathmm" if ($cosmic_fathmm ne ""); + } + + if (exists($cosmic_gene_freq->{$fields[$pos{SYMBOL}]})) { + $gene_freq = "@{$cosmic_gene_freq->{$fields[$pos{SYMBOL}]}}[0] / @{$cosmic_gene_freq->{$fields[$pos{SYMBOL}]}}[1]" + } + + if ($fields[$pos{SYMBOL}]) { + if ($fields[$pos{SYMBOL}] ne $last_gene[0]) { + #Get information about gene symbol: pathway_description, pathway_ids and entrez_gene_id + @last_gene = get_kegg_id_sym(uc($fields[$pos{SYMBOL}])); + } + $kegg_data = $last_gene[1]; + $kegg_ids = $last_gene[2]; + } + + $var_type = $fields[$pos{"VARIANT_CLASS"}]; + + if ($fields[$pos{GMAF}]) { + my $num = $fields[$pos{GMAF}]; + my @GMAF_a = split ("&", $fields[$pos{GMAF}]); + foreach my $gf (@GMAF_a) { + if ($gf ne "") { + $GMAF = $gf * 100; + } + } + } + + if (exists($clinvar->{"$line[0]:$VCF_pos:$VCF_ref:$VCF_alt"})) { + $clinvar_acc = @{$clinvar->{"$line[0]:$VCF_pos:$VCF_ref:$VCF_alt"}}[1]; + $clinvar_dis = @{$clinvar->{"$line[0]:$VCF_pos:$VCF_ref:$VCF_alt"}}[0]; + $clinvar_pat = @{$clinvar->{"$line[0]:$VCF_pos:$VCF_ref:$VCF_alt"}}[2]; + } + + my $prot_pos = $fields[$pos{Protein_position}]; + my $prot_end; + my $ident = ""; + + if (exists($uniprot_b->{$fields[$pos{SYMBOL}]})) { + $ident = $uniprot_b->{$fields[$pos{SYMBOL}]}; + + my $prot_end = 0; + + if ($fields[$pos{Protein_position}] =~ /(\d+)\-(\d+)/) { + $prot_pos = $1; + $prot_end = $2; + } + + if 
($fields[$pos{Protein_position}] =~ /(\d+)\-(\?)/) { + $prot_pos = $1; + $prot_end = 0; + } + + if ($fields[$pos{Protein_position}] =~ /(\?)\-(\d+)/) { + $prot_pos = 0; + $prot_end = $2; + } + + + if ($prot_pos ne "") { + if (exists($pfam_a->{$ident})) { + foreach my $ia (0..scalar(@{$pfam_a->{$ident}})-1) { + if (($prot_pos >= ${@{$pfam_a->{$ident}}[$ia]}[2] && $prot_pos <= ${@{$pfam_a->{$ident}}[$ia]}[3]) || ($prot_end >= ${@{$pfam_a->{$ident}}[$ia]}[2] && $prot_end <= ${@{$pfam_a->{$ident}}[$ia]}[3])) { + $pfam = "${@{$pfam_a->{$ident}}[$ia]}[0]: ${@{$pfam_a->{$ident}}[$ia]}[1]"; + } + } + } + if (exists($interpro_a->{$ident})) { + foreach my $ii (0..scalar(@{$interpro_a->{$ident}})-1) { + if (($prot_pos >= ${@{$interpro_a->{$ident}}[$ii]}[2] && $prot_pos <= ${@{$interpro_a->{$ident}}[$ii]}[3]) || ($prot_end >= ${@{$interpro_a->{$ident}}[$ii]}[2] && $prot_end <= ${@{$interpro_a->{$ident}}[$ii]}[3])) { + $interpro = "${@{$interpro_a->{$ident}}[$ii]}[0]: ${@{$interpro_a->{$ident}}[$ii]}[1]"; + } + } + } + if ($fields[$pos{Consequence}] =~ /(stop_gained|frameshift_variant)/ && $interpro eq "" && exists($last_domain->{$ident}) && $prot_pos <= $last_domain->{$ident}) { + $interpro = "Mutation previous last protein domain"; + } + } + + } + + my $gene_role = ''; + $gene_role = $generole->{$fields[$pos{SYMBOL}]} if ($generole->{$fields[$pos{SYMBOL}]}); + + if ($fields[$pos{gnomADe}]) { + my @gnomAD = split ("&", $fields[$pos{gnomADe}]); + foreach my $ex (@gnomAD) { + if ($ex ne "") { + $gnomAD = $ex * 100; + } + } + } + if ($fields[$pos{gnomADe_NFE}]) { + my @gnomAD = split ("&", $fields[$pos{gnomADe_NFE}]); + foreach my $ex (@gnomAD) { + if ($ex ne "") { + $gnomAD_NFE = $ex * 100; + } + } + } + my @q_data_line = [$line[0], $line[1], $line[2], $line[3], $line[4], $line[5], $line[6], $line[7], $line[8], $line[9], $line[10], $line[11], $fields[$pos{Allele}], $fields[$pos{Gene}], $fields[$pos{Feature}], $fields[$pos{Feature_type}], $fields[$pos{Consequence}], $fields[$pos{Impact}], $fields[$pos{cDNA_position}], $fields[$pos{CDS_position}], $fields[$pos{Protein_position}], $fields[$pos{Amino_acids}], $fields[$pos{Codons}], $fields[$pos{Existing_variation}], "", $fields[$pos{APPRIS}], $pol_cons, $pol_sco, $sift_cons, $sift_sco, $CADD_phred, $CADD_raw, $fields[$pos{SYMBOL}], $gene_role, $cosmic_id, $kegg_data, $kegg_ids, $clinvar_acc, $clinvar_dis, $clinvar_pat, $var_type, $HGVSc, $HGVSp, $GMAF, $gnomAD, $gnomAD_NFE, $pfam, $interpro, $gene_freq, $mut_freq]; + + @q_data_line = &create_vscore (@q_data_line); + + no warnings 'uninitialized'; + push @q_data, @q_data_line; + print OUT join("\t", @{$q_data[scalar(@q_data)-1]}), "\n"; + } + $count++; + } + + @q_data = sort { $a->[0] cmp $b->[0] || $a->[1] cmp $b->[1] || $a->[10] cmp $b->[10] || $a->[13] cmp $b->[13] || $a->[42] cmp $b->[42] || $a->[14] cmp $b->[14]} @q_data; + + foreach my $qi (0..$#q_data) { + no warnings 'uninitialized'; + if ($conseq_file == 1) { + if ($q_data[$qi][17] =~ /HIGH|MODERATE/) { + print OUTSORT 
"$q_data[$qi][0]\t$q_data[$qi][1]\t$q_data[$qi][10]\t$q_data[$qi][13]\t$q_data[$qi][14]\t$q_data[$qi][15]\t$q_data[$qi][16]\t$q_data[$qi][17]\t$q_data[$qi][25]\t$q_data[$qi][26]\t$q_data[$qi][27]\t$q_data[$qi][28]\t$q_data[$qi][29]\t$q_data[$qi][30]\t$q_data[$qi][31]\t$q_data[$qi][32]\t$q_data[$qi][33]\t$q_data[$qi][34]\t$q_data[$qi][35]\t$q_data[$qi][36]\t$q_data[$qi][20]\t$q_data[$qi][21]\t$q_data[$qi][37]\t$q_data[$qi][38]\t$q_data[$qi][39]\t$q_data[$qi][40]\t$q_data[$qi][41]\t$q_data[$qi][42]\t$q_data[$qi][43]\t$q_data[$qi][44]\t$q_data[$qi][45]\t$q_data[$qi][46]\t$q_data[$qi][47]\t$q_data[$qi][48]\t$q_data[$qi][49]\t$q_data[$qi][50]\t$q_data[$qi][51]\t$q_data[$qi][52]\t$q_data[$qi][53]\t$q_data[$qi][54]\t$q_data[$qi][55]\t$q_data[$qi][56]\t$q_data[$qi][57]\t$q_data[$qi][58]\t$q_data[$qi][59]\t$q_data[$qi][60]\t$q_data[$qi][61]\n"; + } + } else { + print OUTSORT "$q_data[$qi][0]\t$q_data[$qi][1]\t$q_data[$qi][10]\t$q_data[$qi][13]\t$q_data[$qi][14]\t$q_data[$qi][15]\t$q_data[$qi][16]\t$q_data[$qi][17]\t$q_data[$qi][25]\t$q_data[$qi][26]\t$q_data[$qi][27]\t$q_data[$qi][28]\t$q_data[$qi][29]\t$q_data[$qi][30]\t$q_data[$qi][31]\t$q_data[$qi][32]\t$q_data[$qi][33]\t$q_data[$qi][34]\t$q_data[$qi][35]\t$q_data[$qi][36]\t$q_data[$qi][37]\t$q_data[$qi][20]\t$q_data[$qi][21]\t$q_data[$qi][38]\t$q_data[$qi][39]\t$q_data[$qi][40]\t$q_data[$qi][41]\t$q_data[$qi][42]\t$q_data[$qi][43]\t$q_data[$qi][44]\t$q_data[$qi][45]\t$q_data[$qi][46]\t$q_data[$qi][47]\t$q_data[$qi][48]\t$q_data[$qi][49]\t$q_data[$qi][50]\t$q_data[$qi][51]\t$q_data[$qi][52]\t$q_data[$qi][53]\t$q_data[$qi][54]\t$q_data[$qi][55]\t$q_data[$qi][56]\t$q_data[$qi][57]\t$q_data[$qi][58]\t$q_data[$qi][59]\t$q_data[$qi][60]\t$q_data[$qi][61]\n"; + } + } + } + } + close INPUT; + close OUT; + close OUTSORT; + print "Chromosome $chr processed!\n"; + $pm->finish; # Terminates the child process + } + $pm->wait_all_children; + open (GREATOUT, ">$outexp/vep_data.csv"); + print GREATOUT "Chr\tLoc\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\tmut\tlocation\tallele\tgene\tfeature\tfeature_type\tconsequence\timpact\tcdna_position\tcds_position\tprotein_position\tamino_acids\tcodons\texisting_variation\textra\tprincipal\tpoly_effect\tpoly_score\tsift_effect\tsift_score\tCADD_phred\tCADD_raw\tgene_hgnc\tgene_role\tcosmic_id\tKEGG_data\tKEGG_path_id\tclinvar_acc\tclinvar_disease\tclinvar_clinical_significance\tvariation_type\tHGVS_cDNA\tHGVS_protein\tGMAF\tgnomAD\tgnomAD_NFE\tpfam\tinterpro\tgene_cosmic_freq\tmut_cosmic_freq\tvscore\tbranch\n"; + + foreach $chr (@chr_names) { + open (INPUT, "$outexp/vep_data_$chr.csv"); + while () { + unless ($_ =~ /^Chr/) { + print GREATOUT $_; + } + } + close INPUT; + unlink ("$outexp/vep_data_$chr.csv"); + unlink ("$outexp/ensembl_vep_$chr.csv"); + } + close GREATOUT; + + open (GREATOUT, ">$outexp/vep_data_sorted.csv"); + print GREATOUT "chr\tloc\tmut\tgene\tfeature\tfeature_type\tconsequence\timpact\tprincipal\tpoly_effect\tpoly_score\tsift_effect\tsift_score\tCADD_phred\tCADD_raw\tgene_hgnc\tgene_role\tcosmic_id\tkegg_data\tkegg_path_id\tprotein_position\tamino_acids\tclinvar_acc\tclinvar_disease\tclinvar_clinical_significance\tvariation_type\tHGVS_cDNA\tHGVS_protein\tGMAF\tgnomAD\tgnomAD_NFE\tpfam\tinterpro\tgene_cosmic_freq\tmut_cosmic_freq\tvscore\tbranch\n"; + + foreach $chr (@chr_names) { + open (INPUT, "$outexp/vep_data_sorted_$chr.csv"); + while () { + unless ($_ =~ /^chr/) { + print GREATOUT $_; + } + } + close INPUT; + unlink ("$outexp/vep_data_sorted_$chr.csv") + } + close GREATOUT; + + # Save 
+    # Save query to file
+    my $veptable_sort = $outpath . "vep_data_sorted.csv";
+
+    # Select the highest score for the principal isoform
+    my @vep_file;
+    open FILE, "<$veptable_sort" or die $!;
+    while (<FILE>) {
+        chomp $_;
+        my @line = split ("\t", $_);
+        push @vep_file, [@line];
+    }
+    close FILE;
+
+    my %isoform;
+    foreach my $is (1..$#vep_file) {
+        if ($vep_file[$is][7] =~ /HIGH|MODERATE/) {
+            if (exists($isoform{"$vep_file[$is][15]"})) {
+                $isoform{"$vep_file[$is][15]"}[0] = 1 if ($vep_file[$is][8] eq "P1");
+                $isoform{"$vep_file[$is][15]"}[1] = 1 if ($vep_file[$is][8] eq "P2");
+                $isoform{"$vep_file[$is][15]"}[2] = 1 if ($vep_file[$is][8] eq "P3");
+                $isoform{"$vep_file[$is][15]"}[3] = 1 if ($vep_file[$is][8] eq "P4");
+                $isoform{"$vep_file[$is][15]"}[4] = 1 if ($vep_file[$is][8] eq "P5");
+                $isoform{"$vep_file[$is][15]"}[5] = 1 if ($vep_file[$is][8] eq "A1" || $vep_file[$is][8] eq "A2" || $vep_file[$is][8] eq "");
+            } else {
+                $isoform{"$vep_file[$is][15]"} = [1, 0, 0, 0, 0, 0] if ($vep_file[$is][8] eq "P1");
+                $isoform{"$vep_file[$is][15]"} = [0, 1, 0, 0, 0, 0] if ($vep_file[$is][8] eq "P2");
+                $isoform{"$vep_file[$is][15]"} = [0, 0, 1, 0, 0, 0] if ($vep_file[$is][8] eq "P3");
+                $isoform{"$vep_file[$is][15]"} = [0, 0, 0, 1, 0, 0] if ($vep_file[$is][8] eq "P4");
+                $isoform{"$vep_file[$is][15]"} = [0, 0, 0, 0, 1, 0] if ($vep_file[$is][8] eq "P5");
+                $isoform{"$vep_file[$is][15]"} = [0, 0, 0, 0, 0, 1] if ($vep_file[$is][8] eq "A1" || $vep_file[$is][8] eq "A2" || $vep_file[$is][8] eq "");
+            }
+        }
+    }
+
+    foreach my $key (keys (%isoform)) {
+        if ($isoform{$key}[0] == 1) {
+            $isoform{$key} = "P1";
+        } elsif ($isoform{$key}[1] == 1) {
+            $isoform{$key} = "P2";
+        } elsif ($isoform{$key}[2] == 1) {
+            $isoform{$key} = "P3";
+        } elsif ($isoform{$key}[3] == 1) {
+            $isoform{$key} = "P4";
+        } elsif ($isoform{$key}[4] == 1) {
+            $isoform{$key} = "P5";
+        } else {
+            $isoform{$key} = "|A1|A2";
+        }
+    }
+
+    my %genes_affected;
+    my $head = "";
+    foreach my $is (0..$#vep_file) {
+        if ($is == 0) {
+            $head = join("\t", @{$vep_file[$is]});
+        } else {
+            if ($vep_file[$is][7] =~ /HIGH|MODERATE/) {
+                if ($vep_file[$is][8] =~ /^$isoform{$vep_file[$is][15]}$/) {
+                    if (exists($genes_affected{$vep_file[$is][15]})) {
+                        if ($vep_file[$is][35] >= $genes_affected{$vep_file[$is][15]}[0]) {
+                            $genes_affected{$vep_file[$is][15]} = [$vep_file[$is][35], $vep_file[$is][36]];
+                            push @{$genes_affected{$vep_file[$is][15]}}, @{$vep_file[$is]};
+                        }
+                    } else {
+                        $genes_affected{$vep_file[$is][15]} = [$vep_file[$is][35], $vep_file[$is][36]];
+                        push @{$genes_affected{$vep_file[$is][15]}}, @{$vep_file[$is]};
+                    }
+                }
+            }
+        }
+    }
+
+    $genes_affected = $outpath .
"genes_affected.csv"; + + open (FILE, ">$genes_affected"); + print FILE "gene_hgnc\tmax(vscore)\tbranch\t$head\n"; + foreach my $key (keys(%genes_affected)) { + print FILE $key, "\t", join ("\t", @{$genes_affected{$key}}), "\n"; + } + close FILE; + + printl ("\ngenes_affected.csv file created!\n\n"); + +} + +sub chkmut_cosmic() { + + my ($gene, $transcript, $HGVSc) = @_; + + if (exists($cosmic_list0->{"$gene:$transcript:$HGVSc"})) { + return (@{$cosmic_list0->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list1->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list1->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list2->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list2->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list3->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list3->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list4->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list4->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list5->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list5->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list6->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list6->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list7->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list7->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list8->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list8->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list9->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list9->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list10->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list10->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list11->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list11->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list12->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list12->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list13->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list13->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list14->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list14->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list15->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list15->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list16->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list16->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list17->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list17->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list18->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list18->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list19->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list19->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list20->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list20->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list21->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list21->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list22->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list22->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list23->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list23->{"$gene:$transcript:$HGVSc"}}); + } elsif (exists($cosmic_list24->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list24->{"$gene:$transcript:$HGVSc"}}); + } elsif 
(exists($cosmic_list25->{"$gene:$transcript:$HGVSc"})){ + return (@{$cosmic_list25->{"$gene:$transcript:$HGVSc"}}); + } else { + return ("", "", 0, 0, "") + } + +} + +sub get_kegg_id_sym { + + my ($hgnc_symbol) = @_; + + if (exists($genes_ids->{$hgnc_symbol})) { + my @kegg_pathways = &get_kegg_path("$genes_ids->{$hgnc_symbol}"); + return ($hgnc_symbol,@kegg_pathways,$1); + } + else { + return ("","","",""); + } + +} + +sub get_kegg_path { + my ($kegg_id) = @_; + + if (exists($kegg_gene_pathway_DB->{$kegg_id})) { + my @kegg_path_desc = split /\|/, $kegg_gene_pathway_DB->{$kegg_id}; + return get_kegg_path_desc(\@kegg_path_desc); + } + else{ + return ("",""); + } +} + +sub get_kegg_path_desc { + my @kegg_paths = @{$_[0]}; + my $results = ""; + + foreach my $path_id (@kegg_paths) { + if (exists($pathw_desc->{$path_id})) { + $results .= "$pathw_desc->{$path_id}|"; + } + } + + my $keggpaths = join ("|",@kegg_paths); + + $results =~ s/\|$//g; + + return ($results, $keggpaths); +} + +sub create_vscore() { + + # vscore creation + my @vep_file = @_; + my $score = 0; + my $lastgene = ""; + my $skip = 0; + my %scored_columns = (); + + # Get column position to apply the score (poly_score, sift_score, cosmic_id, kegg_path_id ) + # Get Gene_hgnc column for getting sample mutation frequency + # Add column CCLE genes matches column + my @linedata = ("Chr", "Loc", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "sample", "mut", "location", "allele", "gene", "feature", "feature_type", "consequence", "impact", "cdna_position", "cds_position", "protein_position", "amino_acids", "codons", "existing_variation", "extra", "principal", "poly_effect", "poly_score", "sift_effect", "sift_score", "CADD_phred", "CADD_raw", "gene_hgnc", "gene_role", "cosmic_id", "KEGG_data", "KEGG_path_id", "clinvar_acc", "clinvar_disease", "clinvar_clinical_significance", "variation_type", "HGVS_cDNA", "HGVS_protein", "GMAF", "gnomAD", "gnomAD_NFE", "pfam", "interpro", "gene_freq", "mut_freq"); + + foreach my $i (0..$#linedata) { + + if ($linedata[$i] eq "Chr") { $scored_columns{Chr} = $i; } + if ($linedata[$i] eq "Loc") { $scored_columns{Loc} = $i; } + if ($linedata[$i] eq "mut") { $scored_columns{mut} = $i; } + if ($linedata[$i] eq "poly_score") { $scored_columns{poly_score} = $i; } + if ($linedata[$i] eq "sift_score") { $scored_columns{sift_score} = $i; } + if ($linedata[$i] eq "CADD_phred") { $scored_columns{CADD_phred} = $i; } + if ($linedata[$i] eq "gene_role") { $scored_columns{gene_role} = $i; } + if ($linedata[$i] eq "cosmic_id") { $scored_columns{cosmic_id} = $i; } + if ($linedata[$i] eq "protein_position") { $scored_columns{protein_position} = $i; } + if ($linedata[$i] eq "mut_freq") { $scored_columns{mut_cosmic_freq} = $i; } + if ($linedata[$i] eq "gene_freq") { $scored_columns{gene_cosmic_freq} = $i; } + + if ($linedata[$i] eq "consequence") { $scored_columns{consequence} = $i; } + if ($linedata[$i] eq "impact") { $scored_columns{impact} = $i; } + if ($linedata[$i] eq "GMAF") { $scored_columns{GMAF} = $i; } + if ($linedata[$i] eq "gnomAD") { $scored_columns{gnomADe} = $i; } + if ($linedata[$i] eq "pfam") { $scored_columns{pfam} = $i; } + if ($linedata[$i] eq "interpro") { $scored_columns{interpro} = $i; } + if ($linedata[$i] eq "clinvar_clinical_significance") { $scored_columns{clinvar} = $i; } + if ($linedata[$i] eq "INFO") {$scored_columns{INFO} = $i; } + if ($linedata[$i] eq "gene_hgnc") { $scored_columns{gene_hgnc} = $i; } + + } + + foreach my $i (0..$#vep_file) { + + @linedata = @{$vep_file[$i]}; + + if 
($lastgene ne $linedata[$scored_columns{gene_hgnc}]) { + + $lastgene = $linedata[$scored_columns{gene_hgnc}]; + + } + + if ($skip == 0) { + + #Decide the branch for the calculation according to the variation or gene definition + my $branch; + + my @components = split ("; ", $linedata[$scored_columns{gene_role}]); + my @genetype = (); + foreach my $ic (0..$#components) { + if ($components[$ic] =~ /:([\w ]+)/) { + if ($1 eq "ONC" || $1 eq "TSG") { + push(@genetype, $1); + } + } + } + @genetype = uniq @genetype; + + my $genetype_string = join(":", @genetype); + + if ($genetype_string) { + if ($genetype_string eq "ONC") { + $branch = "ONC"; + } elsif ($genetype_string eq "TSG") { + $branch = "TSG"; + } else { + $branch = "UNCLASSIFIED"; + } + } else { + $branch = "UNCLASSIFIED"; + } + + if ($branch eq "UNCLASSIFIED") { + if ($linedata[$scored_columns{consequence}] =~ /stop_gain|stop_lost|frameshift_variant|splice_donor_variant|splice_acceptor_variant|splice_region_variant/) { + $branch = "TSG"; + } + } + + # Add scores + my $prediction_damaging = 0; + + #Cosmic ID + if ($linedata[$scored_columns{cosmic_id}] =~ /(COSV\d+):*/) { + if ($linedata[$scored_columns{cosmic_id}] =~ /PATHOGENIC/) { + $prediction_damaging += 1; + } + if ($linedata[$scored_columns{mut_cosmic_freq}] && $linedata[$scored_columns{mut_cosmic_freq}] =~ /(\d+) \/ (\d+)/) { + if ($branch eq "ONC" || $branch eq "UNCLASSIFIED") { + my $ss; + if ($1 >= 100) { + $ss = 0.125 / 2; + } else { + $ss = (0.125 / 2) * ((log($1) - 0) / (log($2) - 0)); + } + $score += $ss; + } + } + if ($linedata[$scored_columns{gene_cosmic_freq}] && $linedata[$scored_columns{gene_cosmic_freq}] =~ /(\d+) \/ (\d+)/) { + my $ss; + if ($1 >= 100) { + $ss = 0.125 / 2; + } else { + $ss = (0.125 / 2) * ((log($1) - 0) / (log($2) - 0)); + } + $score += $ss; + } + } + +#print "cosm:$score\n"; + #Prediction score + if ($linedata[$scored_columns{poly_score}] && $linedata[$scored_columns{poly_score}] > 0.435) { + $prediction_damaging += 1; + } + + if ($linedata[$scored_columns{sift_score}] ne "" && $linedata[$scored_columns{sift_score}] <= 0.05) { + $prediction_damaging += 1; + } + + if ($linedata[$scored_columns{CADD_phred}] && $linedata[$scored_columns{CADD_phred}] > 20) { + $prediction_damaging += 1; + } + + if ($prediction_damaging >= 3) { + $score += 0.125; + } elsif ($prediction_damaging == 2) { + $score += 0.08; + } elsif ($prediction_damaging == 1) { + $score += 0.04 + } +#print "pred:$score\n"; + #Mutation type + if ($linedata[$scored_columns{impact}] =~ /(HIGH)/) { + $score += 0.125; + } +#print "mut:$score\n"; + #Frequencies + if ($linedata[$scored_columns{GMAF}] ne "") { + if ($linedata[$scored_columns{GMAF}] < 1) { + $score += 0.125 / 2; + } + } else { + $score = ($score + 0.125 / 2); + } + if ($linedata[$scored_columns{gnomADe}] ne "") { + if ($linedata[$scored_columns{gnomADe}] < 1) { + $score += 0.125 / 2; + } + } else { + $score = ($score + 0.125 / 2); + } +#print "freq:$score\n"; + #Domains + if ($linedata[$scored_columns{pfam}] =~ /^(\w+)\./ && exists($cancer_domain->{$1})) { + $score += 0.125; + } elsif ($linedata[$scored_columns{interpro}] eq "Mutation previous last protein domain") { + $score += 0.125; + } else { + if ($linedata[$scored_columns{pfam}] || $linedata[$scored_columns{interpro}]) { + $score += 0.125 / 2; + } + } +#print "dom:$score\n"; + #Clinvar + if ($linedata[$scored_columns{clinvar}] =~ /Pathogenic/) { + if ($zygosity{"$linedata[$scored_columns{Chr}]_$linedata[$scored_columns{Loc}]_$linedata[$scored_columns{mut}]"} ne "") { + $score 
+= 0.125; + } else { + if ($branch eq "ONC" || $branch eq "UNCLASSIFIED") { + $score += 0.250; + } else { + $score += 0.3125; + } + } + } +#print "clin:$score\n"; + #Homozigous + if ($zygosity{"$linedata[$scored_columns{Chr}]_$linedata[$scored_columns{Loc}]_$linedata[$scored_columns{mut}]"} eq "Homozygous") { + if ($branch eq "ONC" || $branch eq "UNCLASSIFIED") { + $score += 0.125; + } else { + $score += 0.1875; + } + } +#print "hom:$score\n"; + #GScore + if (exists($gscore->{$linedata[$scored_columns{gene_hgnc}]})) { + $score += (0.125 * $gscore->{$linedata[$scored_columns{gene_hgnc}]}); + } + + push @{$vep_file[$i]}, (sprintf("%.4f", $score), $branch); + + } + + $score = 0; + + } + + return @vep_file; +} + +sub printl { + $logfile = $logfile . $_[0]; + print $_[0]; +} diff --git a/src/createDB.py b/src/createDB.py new file mode 100644 index 0000000..6288e5e --- /dev/null +++ b/src/createDB.py @@ -0,0 +1,396 @@ +#!/usr/bin/python + +import re +import pdb +import pandas as pd +from datetime import date +import progressbar +import math +import sys + +if len(sys.argv) < 2: + print("python createDB.py level [being 1(create intermediate file), and 2(create final file)]") + sys.exit() +else: + print(sys.argv) + +#Directory with the downloaded files +dwn_dir = 'downloads/' +#Directory with the processed files +pro_dir = 'processed/' +#Directory with the manually reviewed files +mr_dir = 'manual_review/' +#Directory with additional files +add_dir = 'additional_files/' +#Output directory +out_dir = '2.0/' +#Output file +out_file = 'PanDrugs_'+date.today().strftime('%b_%d_%Y')+'.tsv' + +def load_files(): + global checked_gene_symbol_file, drug_names_file, family_KEGG_file, family_cmap_file, drug_approved_data_file, ind_pathway_file, gene_dependency_file, moalmanac_sen_res, gdsc_sen_res, civic_sen_res, oncokb_sen_res, gene_names, gene_pathways, cgc, intogen_group, intogen_min, intogen_max, oncovar, oncovar_min, oncovar_max, chronos, chronos_min, hallmarks + + print('Loading annotation files...') + + checked_gene_symbol_file = pd.read_csv(pro_dir+'genes_checked.tsv', sep ='\t', low_memory=False) + checked_gene_symbol_file['gene_name'] = checked_gene_symbol_file['gene_name'].str.upper() + + drug_names_file = pd.read_csv(mr_dir+'drug_synonyms_mrd.tsv', sep ='\t', low_memory=False) + drug_names_file['drug_name'] = drug_names_file['drug_name'].str.upper() + drug_names_file['standard_drug_name'] = drug_names_file['standard_drug_name'].str.upper() + drug_names_file['show_drug_name'] = drug_names_file['show_drug_name'].str.upper() + drug_names_file = drug_names_file.fillna('') + + family_KEGG_file = pd.read_csv(pro_dir+'TargetBasedClassificationKEGG_formated.tsv', sep ='\t', low_memory=False) + family_KEGG_file = family_KEGG_file.fillna('') + family_KEGG_file['drug'] = family_KEGG_file['drug'].str.upper() + family_cmap_file = pd.read_csv(pro_dir+'cmap_moa.tsv', sep ='\t', low_memory=False) + family_cmap_file = family_cmap_file.fillna('') + family_cmap_file['drug'] = family_cmap_file['drug'].str.upper() + + drug_approved_data_file = pd.read_csv(mr_dir+'drug_approved_data_mrd.tsv', sep='\t', low_memory=False) + drug_approved_data_file = drug_approved_data_file.fillna('') + + ind_pathway_file = pd.read_csv(pro_dir+'KEGGmodeled/upstream_genes.tsv', sep='\t', low_memory=False) + ind_pathway_file = ind_pathway_file.fillna('') + ind_pathway_file['gene'] = ind_pathway_file['gene'].str.upper() + + gene_dependency_file = pd.read_csv(pro_dir+'genetic_dependencies.tsv', sep='\t', low_memory=False) + 
gene_dependency_file = gene_dependency_file.fillna('')
+
+    moalmanac_sen_res = pd.read_csv(pro_dir+'moalmanac.tsv', sep='\t', low_memory=False)
+    moalmanac_sen_res = moalmanac_sen_res.fillna('')
+    moalmanac_sen_res['drug_name'] = moalmanac_sen_res['drug_name'].str.upper()
+    gdsc_sen_res = pd.read_csv(pro_dir+'GDSC.tsv', sep='\t', low_memory=False)
+    gdsc_sen_res = gdsc_sen_res.fillna('')
+    gdsc_sen_res['drug_name'] = gdsc_sen_res['drug_name'].str.upper()
+    civic_sen_res = pd.read_csv(pro_dir+'civic.tsv', sep='\t', low_memory=False)
+    civic_sen_res = civic_sen_res.fillna('')
+    civic_sen_res['drug_name'] = civic_sen_res['drug_name'].str.upper()
+    oncokb_sen_res = pd.read_csv(pro_dir+'oncokb.tsv', sep='\t', low_memory=False)
+    oncokb_sen_res = oncokb_sen_res.fillna('')
+    oncokb_sen_res['drug_name'] = oncokb_sen_res['drug_name'].str.upper()
+
+    gene_names = pd.read_csv(dwn_dir+'custom', sep='\t', low_memory=False, dtype=str)
+    gene_names = gene_names.fillna('')
+    gene_pathways = pd.read_csv(pro_dir+'gene_pathway.tsv', sep='\t', low_memory=False, dtype=str)
+
+    #Files for gscore calculation
+    cgc = pd.read_csv(pro_dir+'cgc_scores.tsv', sep='\t', low_memory=False, dtype=str)
+    intogen = pd.read_csv(pro_dir+'intogen.tsv', sep='\t', low_memory=False, dtype=str)
+    intogen_group = intogen.groupby('gene_name')['qvalue_combination'].median().reset_index()
+    intogen_min = sorted(intogen_group['qvalue_combination'][intogen_group['qvalue_combination'] > 0.05].tolist())[0]
+    intogen_max = min(intogen_group['qvalue_combination'])
+    oncovar = pd.read_csv(pro_dir+'oncovar_scores.tsv', sep='\t', low_memory=False, dtype=str)
+    oncovar_max = float(max(oncovar[(oncovar['cancer_type'] == 'PanCancer')]['Consensus_Score'].tolist()))
+    chronos = pd.read_csv(pro_dir+'chronos_skew.tsv', sep='\t', low_memory=False, dtype=str, header = None)
+    chronos.columns = ['gene', 'value', 'min']
+    chronos['value'] = chronos['value'].astype(float)
+    chronos_min = sorted(chronos[chronos['value'] > -0.5]['value'].tolist())[0]
+    hallmarks = pd.read_csv(pro_dir+'hallmarks.tsv', sep='\t', low_memory=False, dtype=str)
+
+def drug_gene_associations():
+
+    print('Processing drug-gene associations...')
+    #CIViC and OncoKB records are obtained from their web pages in order to restrict to robust associations and avoid records of adverse effects
+    files = ('DGIdb_interactions.tsv', 'oncokb.tsv', 'civic.tsv', 'DrugBank.tsv', 'moalmanac.tsv', 'GDSC.tsv', 'sabdab.tsv')
+
+    outputf = open(out_dir+'PanDrugs_prov.tsv','w')
+    outputf.write('\t'.join(['gene_symbol', 'checked_gene_symbol', 'source', 'source_drug_name', 'standard_drug_name', 'show_drug_name', 'family', 'status', 'pathology', 'cancer', 'extra', 'extra2', 'pathways', 'target_marker', 'resistance', 'alteration', 'ind_pathway', 'gene_dependency', 'dscore', 'gscore', 'reviews'])+'\n')
+
+    for fil in files:
+        print('>Processing '+fil+'...')
+
+        drug_gene_a = pd.read_csv(pro_dir+fil, sep='\t', low_memory=False)
+        drug_gene_a['drug_name'] = drug_gene_a['drug_name'].str.upper()
+        drug_gene_a = drug_gene_a.fillna('')
+        drug_gene_a.drop_duplicates(inplace=True)
+
+        for i in progressbar.progressbar(range(len(drug_gene_a.index))):
+            row = drug_gene_a.iloc[i]
+
+            (gene_symbol, source, source_drug_name) = ('', '', '')
+
+            gene_symbol = re.split(';|,',row['gene_name'].upper())
+            source = row['source']
+            source_drug_name = row['drug_name'].upper()
+            source_drug_name_reduced = source_drug_name.split(' ')[0]
+
+            for gs in gene_symbol:
+
+                (checked_gene_symbol, standard_drug_name, show_drug_name, family,
status, pathology, cancer, extra, extra2, pathways, target_marker, resistance, alteration, ind_pathway, gene_dependency, dscore, gscore, reviews) = ('', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', []) + + #checked gene symbol + checked_gene_symbol = checked_gene_symbol_file.loc[checked_gene_symbol_file['gene_name'] == gs]['checked_gene_symbol'].tolist()[0] + standard_drug_name = drug_names_file.loc[drug_names_file['drug_name'] == source_drug_name]['standard_drug_name'].tolist()[0] + standard_drug_name_reduced = standard_drug_name.split(' ')[0] + + #drug names file + if drug_names_file.loc[drug_names_file['drug_name'] == source_drug_name]['show_drug_name'].tolist()[0] != '': + show_drug_name = drug_names_file.loc[drug_names_file['drug_name'] == source_drug_name]['show_drug_name'].tolist()[0] + else: + show_drug_name = standard_drug_name + + show_drug_name_reduced = show_drug_name.split(' ')[0] + drug_list = [standard_drug_name, show_drug_name, source_drug_name, standard_drug_name_reduced, show_drug_name_reduced, source_drug_name_reduced] + + #Drug_type + extra2 = drug_approved_data_file.loc[drug_approved_data_file['standard_drug_name'] == standard_drug_name]['drug_type_mc'].tolist()[0] + + families = [] + #Drug family KEGG + found = False + for idx, d in enumerate(drug_list): + if d in family_KEGG_file['drug'].tolist(): + families = families + [x+'(KEGG)' for x in family_KEGG_file.loc[family_KEGG_file['drug'] == d]['family2'].tolist()] + if idx > 2: reviews.append('KEGG') + found = True + break + if not found: + for idx, d in enumerate(drug_list): + if d in family_KEGG_file['drug'].str.split(' ', n = 1, expand = True)[0].tolist(): + index = family_KEGG_file['drug'].str.split(' ', n = 1, expand = True)[0].tolist().index(d) + families = families + [x+'(KEGG)' for x in family_KEGG_file.iloc[[index]]['family2'].tolist()] + reviews.append('KEGG') + break + + #Drug family cmap + found = False + for idx, d in enumerate(drug_list): + if d in family_cmap_file['drug'].tolist(): + families = families + [x+'(Cmap)' for x in family_cmap_file.loc[family_cmap_file['drug'] == d]['moa'].tolist()] + if idx > 2: reviews.append('Cmap') + found = True + break + if not found: + for idx, d in enumerate(drug_list): + if d in family_cmap_file['drug'].str.split(' ', n = 1, expand = True)[0].tolist(): + index = family_cmap_file['drug'].str.split(' ', n = 1, expand = True)[0].tolist().index(d) + families = families + [x+'(Cmap)' for x in family_cmap_file.iloc[[index]]['moa'].tolist()] + reviews.append('Cmap') + break + + if len(families) > 0: family = ', '.join(families) + else: family = 'Other' + + #Status + status = drug_approved_data_file.loc[drug_approved_data_file['standard_drug_name'] == standard_drug_name]['status_mc'].tolist()[0] + + #Labels info + if status == 'Approved': + pathology = drug_approved_data_file.loc[drug_approved_data_file['standard_drug_name'] == standard_drug_name]['pathology_mc'].tolist()[0] + extra = drug_approved_data_file.loc[drug_approved_data_file['standard_drug_name'] == standard_drug_name]['indication_mc'].tolist()[0] + + cancer = drug_approved_data_file.loc[drug_approved_data_file['standard_drug_name'] == standard_drug_name]['cancer_type_mc'].tolist()[0] + + #Target_marker + if source in ['CancerCommons', 'ClearityFoundationClinicalTrial', 'DrugBank', 'MyCancerGenome', 'TALC', 'TEND', 'TTD', 'SAbDab']: + target_marker = 'target' + else: + target_marker = 'marker' + + #Pathway members + if len(ind_pathway_file.loc[ind_pathway_file['gene'] == checked_gene_symbol]) > 
0:
+            ind_pathway = ind_pathway_file.loc[ind_pathway_file['gene'] == checked_gene_symbol]['upstream_genes'].tolist()[0]
+
+        #Genetic dependencies
+        if checked_gene_symbol in gene_dependency_file['gene'].tolist():
+            gene_dependency = gene_dependency_file[gene_dependency_file['gene'] == checked_gene_symbol]['genetic_dependency'].tolist()[0]
+
+        #Resistance and alteration
+        resistance = 'sensitivity'
+        if source == 'CIViC':
+            if len(civic_sen_res[(civic_sen_res['drug_name'] == source_drug_name) & (civic_sen_res['gene_name'] == gs)]['response'].tolist()) > 0:
+                resistance = civic_sen_res[(civic_sen_res['drug_name'] == source_drug_name) & (civic_sen_res['gene_name'] == gs)]['response'].tolist()[0]
+                alteration = civic_sen_res[(civic_sen_res['drug_name'] == source_drug_name) & (civic_sen_res['gene_name'] == gs)]['variation'].tolist()[0]
+        elif source == 'GDSC':
+            if len(gdsc_sen_res[(gdsc_sen_res['drug_name'] == source_drug_name) & (gdsc_sen_res['gene_name'] == gs)]['response'].tolist()) > 0:
+                resistance = gdsc_sen_res[(gdsc_sen_res['drug_name'] == source_drug_name) & (gdsc_sen_res['gene_name'] == gs)]['response'].tolist()[0]
+                alteration = gdsc_sen_res[(gdsc_sen_res['drug_name'] == source_drug_name) & (gdsc_sen_res['gene_name'] == gs)]['alteration'].tolist()[0]
+        elif source == 'MOAlmanac':
+            if len(moalmanac_sen_res[(moalmanac_sen_res['drug_name'] == source_drug_name) & (moalmanac_sen_res['gene_name'] == gs)]['response'].tolist()) > 0:
+                resistance = moalmanac_sen_res[(moalmanac_sen_res['drug_name'] == source_drug_name) & (moalmanac_sen_res['gene_name'] == gs)]['response'].tolist()[0]
+                alteration = moalmanac_sen_res[(moalmanac_sen_res['drug_name'] == source_drug_name) & (moalmanac_sen_res['gene_name'] == gs)]['alteration'].tolist()[0]
+        elif source == 'OncoKB':
+            if len(oncokb_sen_res[(oncokb_sen_res['drug_name'] == source_drug_name) & (oncokb_sen_res['gene_name'] == gs)]['response'].tolist()) > 0:
+                resistance = oncokb_sen_res[(oncokb_sen_res['drug_name'] == source_drug_name) & (oncokb_sen_res['gene_name'] == gs)]['response'].tolist()[0]
+                alteration = oncokb_sen_res[(oncokb_sen_res['drug_name'] == source_drug_name) & (oncokb_sen_res['gene_name'] == gs)]['variation'].tolist()[0]
+        elif source == 'COSMIC':
+            resistance = 'resistance'
+
+        #pathways
+        if len(gene_names.loc[gene_names['Approved symbol']==checked_gene_symbol,]['NCBI Gene ID'].tolist()) > 0:
+            NCBI_name = gene_names.loc[gene_names['Approved symbol']==checked_gene_symbol,]['NCBI Gene ID'].tolist()[0]
+            if len(gene_pathways.loc[gene_pathways['KEGG Gene ID'] == NCBI_name]['KEGG Pathway ID'].tolist()) > 0:
+                pathways = gene_pathways.loc[gene_pathways['KEGG Gene ID'] == NCBI_name]['KEGG Pathway ID'].tolist()[0]
+
+        gscore = str(compute_gscore(checked_gene_symbol))
+
+        outputf.write('\t'.join([gs, checked_gene_symbol, source, source_drug_name, str(standard_drug_name), str(show_drug_name), family, status, pathology, cancer, extra, str(extra2), pathways, target_marker, resistance, alteration, ind_pathway, gene_dependency, dscore, gscore, ';'.join(reviews)])+'\n')
+
+    outputf.close()
+
+    print('****IMPORTANT****'+'\n'+'Before creating the final PanDrugs file:')
+    print('-> Check the records with review comments and modify them accordingly')
+    print('-> Check the associations of KRAS, TP53, STK11 and APC (using checked_gene_symbol column) with drugs (using standard_drug_name column) with a target relationship (using target_marker column) in the PanDrugs file. Update them in the controled_records.tsv file along with the corresponding tag (keep/exclude/target/marker).')
+    print('-> Recover EGFR (using checked_gene_symbol column) - drug (using standard_drug_name column) associations with a target relationship (using target_marker column) in the PanDrugs file. For each record without any described targeted MET association with the drug, create a MET-drug association and update them in the controled_records.tsv file along with the corresponding tag (amp_resistance).')
+    print('Then re-run in create final file mode to create the final PanDrugs file.'+'\n'+'*****************')
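+# Added note (comment only, not in the original source): the script runs in two
+# passes. Pass 1 (sys.argv[1] == '1') calls drug_gene_associations() to build
+# PanDrugs_prov.tsv and prints the curation checklist above; once
+# controled_records.tsv has been reviewed, pass 2 runs create_final_file() and
+# create_pubchem_ids_file() to emit the final table.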
+
+def create_final_file():
+
+    print('Creating final file...')
+
+    pandrugs = pd.read_csv(out_dir+'PanDrugs_prov.tsv', sep='\t', low_memory=False)
+    pandrugs = pandrugs.fillna('')
+
+    #update all target records
+    print('Updating controlled records...')
+    target = pandrugs.loc[pandrugs['target_marker'] == 'target'].drop_duplicates(subset=['checked_gene_symbol', 'standard_drug_name'])
+
+    for index, row in target.iterrows():
+        pandrugs.loc[((pandrugs['checked_gene_symbol'] == row['checked_gene_symbol']) & (pandrugs['standard_drug_name'] == row['standard_drug_name'])), 'target_marker'] = 'target'
+
+    conres = pd.read_csv(add_dir+'controled_records.tsv', sep='\t', low_memory=False)
+
+    for index, row in conres.loc[conres['reason'] == 'marker'].iterrows():
+        pandrugs.loc[((pandrugs['checked_gene_symbol'] == row['checked_gene_symbol']) & (pandrugs['standard_drug_name'] == row['standard_drug_name'])), 'target_marker'] = 'marker'
+
+    for index, row in conres.loc[conres['reason'] == 'exclude'].iterrows():
+        pandrugs = pandrugs.drop(pandrugs.loc[((pandrugs['checked_gene_symbol'] == row['checked_gene_symbol']) & (pandrugs['standard_drug_name'] == row['standard_drug_name']))].index)
+
+    for index, row in conres.loc[conres['reason'] == 'amp_resistance'].iterrows():
+        pandrugs = pandrugs.reset_index(drop=True)
+        pandrugs.loc[len(pandrugs.index)] = [row['checked_gene_symbol'], row['checked_gene_symbol'], 'Curated', row['standard_drug_name'], row['standard_drug_name']]+pandrugs.loc[pandrugs['standard_drug_name'] == row['standard_drug_name']][['show_drug_name', 'family', 'status', 'pathology', 'cancer', 'extra', 'extra2']].iloc[0].tolist()+pandrugs.loc[pandrugs['checked_gene_symbol'] == row['checked_gene_symbol']][['pathways']].iloc[0].tolist()+['marker', 'resistance', 'amplification']+pandrugs.loc[pandrugs['checked_gene_symbol'] == row['checked_gene_symbol']][['ind_pathway', 'gene_dependency']].iloc[0].tolist()+['']+pandrugs.loc[pandrugs['checked_gene_symbol'] == row['checked_gene_symbol']][['gscore']].iloc[0].tolist()+['']
+
+    for index, row in conres.loc[conres['reason'] == 'sensitivity'].iterrows():
+        pandrugs.loc[((pandrugs['checked_gene_symbol'] == row['checked_gene_symbol']) & (pandrugs['standard_drug_name'] == row['standard_drug_name'])), 'resistance'] = 'sensitivity'
+    for index, row in conres.loc[conres['reason'] == 'resistance'].iterrows():
+        pandrugs.loc[((pandrugs['checked_gene_symbol'] == row['checked_gene_symbol']) & (pandrugs['standard_drug_name'] == row['standard_drug_name'])), 'resistance'] = 'resistance'
+
+    #homogenize target-marker for salt correction
+    print('Homogenizing target-marker information...')
+    genes = list(set(pandrugs['checked_gene_symbol'].tolist()))
+
+    for g in genes:
+        subset = pandrugs.loc[pandrugs['checked_gene_symbol'] == g]
+        drugs = list(set(subset['show_drug_name'].tolist()))
+        for d in drugs:
+            drug = d.split(' ')
+            if len(drug) > 1 and drug[0] in drugs:
+                if subset.loc[(subset['checked_gene_symbol'] == g) & (subset['show_drug_name'] == d)]['target_marker'].tolist()[0] != subset.loc[(subset['checked_gene_symbol'] == g) & (subset['show_drug_name'] == drug[0])]['target_marker'].tolist()[0]:
+                    idx = pandrugs.index[(pandrugs['checked_gene_symbol'] == g) & (pandrugs['show_drug_name'] == d)].tolist()
+                    pandrugs.loc[idx, ['target_marker']] = 'target'
+
+                    idx = pandrugs.index[(pandrugs['checked_gene_symbol'] == g) & (pandrugs['show_drug_name'] == drug[0])].tolist()
+                    pandrugs.loc[idx, ['target_marker']] = 'target'
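+    # Added note (not in the original source): show_drug_name values may carry a salt
+    # suffix (e.g. a hypothetical 'IMATINIB MESYLATE' vs 'IMATINIB'); when both forms
+    # are associated with the same gene but disagree on target_marker, the loop above
+    # promotes both records to 'target' so salt and base forms stay consistent.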
+
+    #homogenize drug families
+    print('Homogenizing drug families...')
+    drugs = list(set(pandrugs['show_drug_name'].tolist()))
+    for d in drugs:
+        f_values = list(set(pandrugs.loc[pandrugs['show_drug_name'] == d]['family'].tolist()))
+        if len(f_values) > 1:
+            idx = pandrugs.index[pandrugs['show_drug_name'] == d].tolist()
+            pandrugs.loc[idx, ['family']] = [x for x in f_values if x != 'Other'][0]
+
+    #Drug-score calculation
+    for index, row in pandrugs.iterrows():
+        dscore = 0
+        if row['status'] == 'Approved':
+            if row['extra2'] != '':
+                if row['target_marker'] == 'target': dscore = 1
+                else: dscore = 0.9
+            else:
+                if row['cancer'] == 'clinical cancer':
+                    if row['target_marker'] == 'target': dscore = 0.8
+                    else: dscore = 0.7
+                else:
+                    if row['target_marker'] == 'target': dscore = 0.4
+                    else: dscore = 0.3
+        if row['status'] == 'Clinical trials':
+            if row['cancer'] == 'cancer':
+                if row['target_marker'] == 'target': dscore = 0.6
+                else: dscore = 0.5
+            else:
+                if row['target_marker'] == 'target': dscore = 0.2
+                else: dscore = 0.1
+        if row['status'] == 'Experimental':
+            if row['target_marker'] == 'target': dscore = 0.0008
+            else: dscore = 0.0004
+        if row['resistance'] == 'resistance':
+            dscore = dscore * (-1)
+
+        pandrugs.loc[index, 'dscore'] = dscore
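+    # Added summary (comment only, not in the original source) of the tiers assigned above:
+    #   Approved, extra2 (drug type) set:       target 1      / marker 0.9
+    #   Approved, cancer == 'clinical cancer':  target 0.8    / marker 0.7
+    #   Approved, otherwise:                    target 0.4    / marker 0.3
+    #   Clinical trials, cancer == 'cancer':    target 0.6    / marker 0.5
+    #   Clinical trials, otherwise:             target 0.2    / marker 0.1
+    #   Experimental:                           target 0.0008 / marker 0.0004
+    # Resistance associations flip the sign of the score.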
+
+    #remove duplicates due to repeated genes in original databases
+    pandrugs.drop_duplicates(inplace=True)
+
+    #add GScore and gene role
+    gscore = pd.read_csv(add_dir+"gscore_Ene_2023.tsv", sep='\t', low_memory=False)
+    driver = pd.read_csv(add_dir+"drivers_Ene_2023.tsv", sep='\t', low_memory=False)
+
+    pandrugs_gs = pd.merge(pandrugs, gscore, on="checked_gene_symbol", how='left')
+    pandrugs_gs_dr = pd.merge(pandrugs_gs, driver, on="checked_gene_symbol", how='left')
+
+    pandrugs_gs_dr.to_csv(out_dir+out_file, sep='\t', index=False, header=True)
+
+def create_pubchem_ids_file():
+
+    print('Creating PubChemID file...')
+
+    pandrugs = pd.read_csv(out_dir+out_file, sep='\t', low_memory=False)
+    pandrugs = pandrugs.fillna('')
+
+    drugs = pandrugs['show_drug_name']
+
+    outputf = open(out_dir+'show_drug_name_lk_pubchem.tsv', 'w')
+    outputf.write('\t'.join(['show_drug_name', 'PubChemID'])+'\n')
+
+    import pubchempy as pcp
+
+    drugs = list(set(drugs))
+    for i in progressbar.progressbar(range(len(drugs))):
+        response = pcp.get_cids(drugs[i], 'name')
+        CID = ''
+        if len(response) > 0:
+            CID = '|'.join([str(x) for x in response])
+        outputf.write('\t'.join([drugs[i], CID])+'\n')
+
+    outputf.close()
+
+def compute_gscore(gene):
+    gscore = 0
+    if not cgc.loc[cgc['gene'] == gene].empty:
+        if '1' in list(set(cgc.loc[cgc['gene'] == gene]['Tier'])): gscore += 0.2
+        else: gscore += 0.1
+
+    if not intogen_group.loc[intogen_group['gene_name'] == gene].empty:
+        qvalue = intogen_group.loc[intogen_group['gene_name'] == gene]['qvalue_combination'].tolist()[0]
+        if qvalue <= 0.05: gscore += 0.2 * (qvalue - intogen_min) / (intogen_max - intogen_min)
+
+    if not oncovar.loc[(oncovar['cancer_type'] == 'PanCancer') & (oncovar['gene'] == gene)].empty:
+        score = float(oncovar.loc[(oncovar['cancer_type'] == 'PanCancer') & (oncovar['gene'] == gene)]['Consensus_Score'].tolist()[0])
+        if score <= 3:
+            gscore += 0.1 * (score - 0) / (3 - 0)
+        else:
+            gscore += 0.1 + (0.1 * (score - 3) / (oncovar_max - 3))
+
+    if not chronos.loc[chronos['gene'] == gene].empty:
+        value = chronos.loc[chronos['gene'] == gene]['value'].tolist()[0]
+        if value < -2: gscore += 0.2
+        elif value <= -0.5: gscore += 0.2 * (value - chronos_min) / (-2 - chronos_min)
+
+    if not hallmarks.loc[hallmarks['Genes_list'] == gene].empty:
+        n_hall = len(list((set(hallmarks.loc[hallmarks['Genes_list'] == gene]['Hallmark']))))
+        if n_hall >= 5: gscore += 0.2
+        else: gscore += 0.2 * (n_hall - 0) / (5 - 0)
+
+    return round(gscore, 4)
+
+load_files()
+
+if sys.argv[1] == '1':
+    drug_gene_associations()
+else:
+    create_final_file()
+    create_pubchem_ids_file()
diff --git a/src/download_preprocess_drugbank.R b/src/download_preprocess_drugbank.R
new file mode 100644
index 0000000..0664e14
--- /dev/null
+++ b/src/download_preprocess_drugbank.R
@@ -0,0 +1,39 @@
+rm(list = ls()) # R version 4.1.3
+library(XML) # XML_3.99-0.9
+library(dbparser) # dbparser_1.2.0
+library(tidyverse) # tidyverse_1.3.1
+setwd(".")
+outdir <- "results/"
+
+# --- Data ---
+read_drugbank_xml_db("data/drugbank.xml")
+
+# --- Code ---
+# Outdir
+dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
+
+# Run parsers
+drugBank <- run_all_parsers()
+
+# Drug names
+dnames <- drugBank$general_information %>%
+  rename(preferred.DB = name, DB.IDs = primary_key) %>%
+  group_by(DB.IDs) %>%
+  mutate(n.preferred = length(unique(preferred.DB))) %>%
+  select(preferred.DB, DB.IDs, n.preferred) %>% unique
+table(dnames$n.preferred) # One term per drug
+
+# Targets
+targets <- drugBank$targets_polypeptides %>% select(gene_name, parent_id) %>%
+  rename(id = parent_id) %>% unique %>%
+  merge(drugBank$targets, by = "id") %>% filter(organism == "Humans") %>%
+  rename(targets = gene_name, DB.IDs = parent_key) %>%
+  select(targets, DB.IDs) %>% unique
+
+# Drug-gene interactions
+drug_gene <- merge(dnames, targets, by = "DB.IDs", all = TRUE) %>%
+  select(-n.preferred) %>% na.omit() %>% filter(targets != "") %>% unique()
+
+# Save
+write.table(drug_gene, file = paste0(outdir, "DrugBank.tsv"), col.names = TRUE,
+            row.names = FALSE, sep = "\t")
diff --git a/src/downloads.py b/src/downloads.py
new file mode 100644
index 0000000..52a46d4
--- /dev/null
+++ b/src/downloads.py
@@ -0,0 +1,410 @@
+#!/usr/bin/python
+
+import os
+import re
+import subprocess
+import pdb
+import gzip
+import wget
+import pandas as pd
+import progressbar
+import urllib.request
+
+#Set to the desired download directory
+dwn_dir = 'downloads/'
+
+#URLs for downloadable files
+gencode = 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/latest_release/gencode.v39.annotation.gtf.gz'
+gdsc = 'ftp://ftp.sanger.ac.uk/pub/project/cancerrxgene/releases/current_release/ANOVA_results_GDSC2_20Feb20.xlsx'
+features = 'https://www.cancerrxgene.org/downloads/download/genetic_feature'
+kegg_ATC = 'https://www.genome.jp/kegg-bin/download_htext?htext=br08310&format=htext&filedir='
+cmap = 'https://api.clue.io/api/rep_drug_moas/?filter[skip]=0&user_key=' #Access the CLUE website and retrieve a user key
+fda = 'https://www.fda.gov/media/89850/download'
+fda_label = ['https://download.open.fda.gov/drug/label/drug-label-0001-of-0011.json.zip', 'https://download.open.fda.gov/drug/label/drug-label-0002-of-0011.json.zip', 'https://download.open.fda.gov/drug/label/drug-label-0003-of-0011.json.zip', 'https://download.open.fda.gov/drug/label/drug-label-0004-of-0011.json.zip', 'https://download.open.fda.gov/drug/label/drug-label-0005-of-0011.json.zip', 'https://download.open.fda.gov/drug/label/drug-label-0006-of-0011.json.zip', 'https://download.open.fda.gov/drug/label/drug-label-0007-of-0011.json.zip', 'https://download.open.fda.gov/drug/label/drug-label-0008-of-0011.json.zip', 'https://download.open.fda.gov/drug/label/drug-label-0009-of-0011.json.zip', 'https://download.open.fda.gov/drug/label/drug-label-0010-of-0011.json.zip', 'https://download.open.fda.gov/drug/label/drug-label-0011-of-0011.json.zip']
+ema = 'https://www.ema.europa.eu/sites/default/files/Medicines_output_european_public_assessment_reports.xlsx'
+ct = 'https://clinicaltrials.gov/AllPublicXML.zip'
+kegg_ind = 'http://rest.kegg.jp/get/$path/kgml' # $path will be substituted with the pathway codes
+cgc = 'https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v95/cancer_gene_census.csv'
+oncovar = 'https://oncovar.org/resource/download/All_genes_OncoVar_TCGA.tar.gz'
+therasabdab = 'http://opig.stats.ox.ac.uk/webapps/newsabdab/static/downloads/TheraSAbDab_SeqStruc_OnlineDownload.csv'
+intogen = 'https://www.intogen.org/download?file=IntOGen-Drivers-20200201.zip'
+depmap = 'https://ndownloader.figshare.com/files/34990036'
+genes_dwn = 'https://www.genenames.org/cgi-bin/download/custom?col=gd_app_sym&col=gd_pub_eg_id&status=Approved&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit'
+genepathway_dwn = 'http://rest.kegg.jp/link/pathway/hsa'
+pathwaydesc_dwn = 'http://rest.kegg.jp/list/pathway/hsa'
+sl = 'https://figshare.com/ndownloader/files/36451119?private_link=a035301c05daa2ce668e'
+cosmic_link = ''
+clinvar_dwn = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz'
+pfam_dwn = 'ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz'
+interpro_dwn = 'ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/match_complete.xml.gz'
+uniprot_dwn = 'ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz'
+hallmarks_dwn = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41598-018-25076-6/MediaObjects/41598_2018_25076_MOESM10_ESM.xlsx'
+
+#place authentication string for COSMIC download
+auth_str = ''
+
+def download_DGIdb():
+    print('Downloading GENCODE annotations...')
+    gencode_dwn = wget.download(gencode, out=dwn_dir)
+    print(gencode_dwn)
+
+    print('Obtaining list of human genes...')
+
+    #Obtain the name of the downloaded file from GENCODE
+    gencode_dwn = ''
+    listdir = os.listdir(dwn_dir)
+    for efile in listdir:
+        if re.search('^gencode', efile) is not None: gencode_dwn = efile
+
+    #Retrieve the gene names
+    genes_file = 'genes_fromGTF.tsv'
+    genes = []
+    inputf = gzip.open(dwn_dir+gencode_dwn,'rt')
+
+    for line in inputf:
+        line = line.rstrip("\n")
+        line_a = line.split("\t")
+        if re.search("##", line_a[0]) is None:
+            names = line_a[8]
+            pattern = re.compile('; gene_name "(.+?)";')
+            gene = pattern.search(names)
+            if gene.group(1) not in genes: genes.append(gene.group(1))
+    inputf.close()
+
+    print('Total number of genes: '+str(len(genes)))
+    df = pd.DataFrame(genes)
+    df.to_csv(dwn_dir+genes_file, index=False, sep='\t', header=False)
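+    # Hedged sketch (not in the original source): python_example.py below is assumed
+    # to be the helper script from the DGIdb API documentation; an equivalent direct
+    # query against the then-current v2 REST endpoint would look roughly like:
+    #   import requests
+    #   r = requests.get('https://dgidb.org/api/v2/interactions.json', params={'genes': 'BRAF'})
+    #   interactions = r.json()['matchedTerms'][0]['interactions']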
+    print('Retrieving data from DGIdb...')
+    #For each gene in the list, retrieve its drug-gene associations from DGIdb
+    dgidb_dwn = 'DGIdb_interactions_dwn.tsv'
+    open(dwn_dir+dgidb_dwn, 'w').close()
+    filei = pd.read_csv(dwn_dir+genes_file, sep='\t', low_memory=False, header=None)
+    genes = filei.iloc[:, 0].tolist()
+
+    for i in progressbar.progressbar(range(len(genes))):
+        command = "python ./python_example.py --genes='"+genes[i]+"' >> "+dwn_dir+dgidb_dwn
+        subprocess.call(command, shell=True)
+
+def download_therasabdab():
+
+    print('Downloading Thera-SAbDab annotations...')
+
+    sabdab_dwn = wget.download(therasabdab, out=dwn_dir)
+    print(sabdab_dwn)
+
+def download_moalmanac():
+
+    print('Downloading MOAlmanac associations...')
+
+    command = "curl -X 'GET' 'https://moalmanac.org/api/assertions' -H 'accept: */*' | jq -r '.[] | \"\\(.assertion_id)\t\\(.features[].attributes[].feature_type)\t\\(.features[].attributes[].gene)\t\\(.features[].attributes[].gene1)\t\\(.features[].attributes[].gene2)\t\\(.therapy_name)\t\\(.therapy_resistance)\t\\(.therapy_sensitivity)\t\\(.therapy_type)\t\\(.features[].attributes[].protein_change)\t\\(.predictive_implication)\t\\(.validated)\"' > "+dwn_dir+"moalmanac_dwn.tsv"
+    subprocess.call(command, shell=True)
+
+def download_GDSC():
+
+    print('Downloading GDSC associations and features...')
+
+    gdsc_dwn = wget.download(gdsc, out=dwn_dir)
+    print(gdsc_dwn)
+
+    features_dwn = wget.download(features, out=dwn_dir+'GDSC_features.csv')
+    print(features_dwn)
+
+def download_KEGG_TB():
+
+    print('Downloading KEGG Target-based classification of drugs...')
+
+    kegg_atc_dwn = wget.download(kegg_ATC, out=dwn_dir)
+    print(kegg_atc_dwn)
+
+def download_cmap():
+
+    print('Downloading CLUE Repurposing data...')
+
+    moa_file = 'cmap_moa.tsv'
+
+    stop = 0
+    skip = 0
+
+    append_file = open(dwn_dir+moa_file, 'w')
+    append_file.close()
+
+    while stop == 0:
+
+        cmap_dwn = wget.download(cmap.replace('[skip]=0','[skip]='+str(skip)), out=dwn_dir)
+        print(cmap_dwn)
+
+        dwn_file = open(dwn_dir+'download.wget', 'r')
+        dwn_file_data = dwn_file.read()
+        if dwn_file_data.count('pert_iname') < 1000: stop = 1
+        dwn_file.close()
+
+        append_file = open(dwn_dir+moa_file, 'a')
+        append_file.write(dwn_file_data+'\n')
+        append_file.close()
+
+        os.remove(dwn_dir+'download.wget')
+        skip = skip+1000
+
+def download_FDA():
+
+    print('Downloading FDA data ...')
+
+    fda_dwn = wget.download(fda, out=dwn_dir)
+    print(fda_dwn)
+
+def download_FDA_labels():
+
+    print('Downloading FDA label data ...')
+
+    for e in fda_label:
+        fda_label_dwn = wget.download(e, out=dwn_dir)
+        print(fda_label_dwn)
+
+def download_EMA():
+
+    print('Downloading EMA data ...')
+
+    ema_dwn = wget.download(ema, out=dwn_dir)
+    print(ema_dwn)
+
+def download_ct():
+
+    print('Downloading Clinical Trial records...')
+
+    ct_dwn = wget.download(ct, out=dwn_dir)
+    print(ct_dwn)
+
+def download_KEGG_ind():
+
+    print('Downloading KEGG pathways...')
+
+    KEGG_dir = dwn_dir+'KEGGmodeled'
+
+    if not os.path.exists(KEGG_dir): os.makedirs(KEGG_dir)
+
+    pathways = ['hsa03320', 'hsa04010', 'hsa04012', 'hsa04014', 'hsa04015', 'hsa04020', 'hsa04022', 'hsa04024', 'hsa04062', 'hsa04064', 'hsa04066', 'hsa04068', 'hsa04071', 'hsa04110', 'hsa04114', 'hsa04115', 'hsa04150', 'hsa04151', 'hsa04152', 'hsa04210', 'hsa04261', 'hsa04270', 'hsa04310', 'hsa04330', 'hsa04340', 'hsa04350', 'hsa04370', 'hsa04390', 'hsa04510', 'hsa04520', 'hsa04530', 'hsa04540', 'hsa04611', 'hsa04620', 'hsa04621', 'hsa04622', 'hsa04630', 'hsa04650', 'hsa04660', 'hsa04662', 'hsa04664', 'hsa04666', 'hsa04668', 'hsa04670', 'hsa04722', 'hsa04910', 'hsa04912', 'hsa04914', 'hsa04915', 'hsa04916', 'hsa04919', 'hsa04920', 'hsa04921', 'hsa04922', 'hsa04971', 'hsa05010', 'hsa05012', 'hsa05160', 'hsa05200', 'hsa05205', 'hsa05212', 'hsa05214', 'hsa05218', 'hsa05231']
+    for path in pathways:
+        command = 'wget -O '+KEGG_dir+'/'+path+'.kgml.xml http://rest.kegg.jp/get/'+path+'/kgml'
+        subprocess.call(command, shell=True)
+
+def download_CGC():
+
+    import ast
+
+    print('Downloading CGC file...')
+
+    p = subprocess.Popen(['curl', '-H', 'Authorization: Basic ' + auth_str, cgc], stdout=subprocess.PIPE)
+    out, err = p.communicate()
+
+    cgc_dwn = wget.download(ast.literal_eval(out.decode('utf-8'))['url'], out=dwn_dir)
+    print(cgc_dwn)
+
+def download_oncovar():
+
+    print('Downloading OncoVar files...')
+
+    oncovar_dwn = wget.download(oncovar, out=dwn_dir)
+    print(oncovar_dwn)
+
+def download_CIViC_evidence():
+
+    print('Downloading CIViC evidence...')
+
+    import requests
+    import json
+
+    url = 'https://civicdb.org/api/graphql'
+
+    nodes = []
+
+    query = """query evidenceItems($evidenceType: EvidenceType){
+        evidenceItems(evidenceType: $evidenceType) {
+            nodes {
+                id
+            }
+            pageCount
+            pageInfo {
+                endCursor
+                startCursor
+            }
+            totalCount
+        }
+    }"""
+
+    variables = """{"evidenceType": "PREDICTIVE"}"""
+
+    r = requests.post(url, json={'query': query, 'variables': variables})
+    if r.status_code == 200:
+        data = json.loads(r.text)
+        for n in data['data']['evidenceItems']['nodes']:
+            nodes.append(n['id'])
+
+        query = """query evidenceItems($evidenceType: EvidenceType, $after: String){
+            evidenceItems(evidenceType: $evidenceType, after: $after) {
+                nodes {
+                    id
+                }
+                pageCount
+                pageInfo {
+                    endCursor
+                    startCursor
+                }
+                totalCount
+            }
+        }"""
+
+        for i in range(1,data['data']['evidenceItems']['pageCount']):
+            variables = """{"evidenceType": "PREDICTIVE", "after": \""""+data['data']['evidenceItems']['pageInfo']['endCursor']+"""\"}"""
+            r = requests.post(url, json={'query': query, 'variables': variables})
+            if r.status_code == 200:
+                data = json.loads(r.text)
+                for n in data['data']['evidenceItems']['nodes']:
+                    nodes.append(n['id'])
+            else:
+                print('Error: ' + r.text)
+    else:
+        print('Error: ' + r.text)
+
+    query = """query evidenceItem($id: Int!){
+        evidenceItem(id: $id) {
+            drugs { name }
+            gene { name }
+            variant { name }
+            status
+            clinicalSignificance
+            evidenceDirection
+            evidenceLevel
+        }
+    }"""
+
+    print('length nodes: '+str(len(nodes)))
+    count = 0
+    output = open(dwn_dir+'civic_evidence.tsv','w')
+    output.write('\t'.join(['drug', 'gene', 'variant', 'status', 'clinical_significance', 'evidence_direction', 'evidenceLevel'])+'\n')
+    for n in progressbar.progressbar(range(len(nodes))):
+        count = count + 1
+        variables = """{"id": """+str(nodes[n])+"""}"""
+        r = requests.post(url, json={'query': query, 'variables': variables})
+        if r.status_code == 200:
+            data = json.loads(r.text)
+            gene = data['data']['evidenceItem']['gene']['name']
+            variant = data['data']['evidenceItem']['variant']['name']
+            status = data['data']['evidenceItem']['status']
+            clinsig = data['data']['evidenceItem']['clinicalSignificance']
+            evdirection = data['data']['evidenceItem']['evidenceDirection']
+            evlevel = data['data']['evidenceItem']['evidenceLevel']
+            drug = []
+            for ele in data['data']['evidenceItem']['drugs']:
+                drug.append(ele['name'])
+            output.write('\t'.join([' + '.join(sorted(drug)), gene, variant, status, clinsig, evdirection, evlevel])+'\n')
+        else:
+            print('Error: ' + r.text)
+    output.close()
+    print('counts: '+str(count))
+
+def download_oncoKB_evidence():
+
+    print('Please download the OncoKB associations file from https://www.oncokb.org/actionableGenes and place it in the downloads directory')
+
+def download_intogen():
+
+    print('Downloading IntOGen data...')
+
+    intogen_dwn = wget.download(intogen, out=dwn_dir)
+    print(intogen_dwn)
+
+def download_depmap():
+
+    print('Downloading DepMap public score + Chronos...')
+
+    depmap_dwn = wget.download(depmap, out=dwn_dir)
+    print(depmap_dwn)
+
+def download_gene_ids():
+
+    print('Downloading GeneIds...')
+    genes = wget.download(genes_dwn, out=dwn_dir)
+    print(genes)
+
+def download_KEGG_pathways():
+
+    genepathway_file = 'gene_pathway.tsv'
+    pathwaydesc_file = 'pathway_desc.tsv'
+
+    print('Downloading GenePathways...')
+    gene_pathway = wget.download(genepathway_dwn, out=dwn_dir)
+    print(gene_pathway)
+    os.rename(dwn_dir+'hsa', dwn_dir+genepathway_file)
+
+    print('Downloading Pathways descriptions...')
+    pathway_desc = wget.download(pathwaydesc_dwn, out=dwn_dir)
+    print(pathway_desc)
+    os.rename(dwn_dir+'hsa', dwn_dir+pathwaydesc_file)
+
+def download_SL():
+
+    print('Downloading SL interactions...')
+    sl_dwn = wget.download(sl, out=dwn_dir)
+    print(sl_dwn)
+
+def download_cosmic():
+    print('Downloading CosmicMutantExport...')
+    cosmic = wget.download(cosmic_link, out=dwn_dir)
+    print(cosmic)
+
+def download_clinvar():
+    print('Downloading ClinVar...')
+    clinvar = wget.download(clinvar_dwn, out=dwn_dir)
+    print(clinvar)
+
+def download_pfam():
+    print('Downloading Pfam...')
+    pfam = wget.download(pfam_dwn, out=dwn_dir)
+    print(pfam)
+
+def download_interpro():
+    print('Downloading InterPro...')
+    interpro = wget.download(interpro_dwn, out=dwn_dir)
+    print(interpro)
+
+def download_uniprot():
+    print('Downloading UniProt...')
+    uniprot = wget.download(uniprot_dwn, out=dwn_dir)
+    print(uniprot)
+
+def download_hallmarks():
+    print('Downloading S8 from the Hallmarks article...')
+    urllib.request.urlretrieve(hallmarks_dwn, dwn_dir+"hallmarks.xlsx")
+
+download_DGIdb()
+download_therasabdab()
+download_moalmanac()
+download_GDSC()
+download_KEGG_TB()
+download_cmap()
+download_FDA()
+download_FDA_labels()
+download_EMA()
+download_ct()
+download_KEGG_ind()
+download_CGC()
+download_oncovar()
+download_CIViC_evidence()
+download_oncoKB_evidence()
+download_intogen()
+download_depmap()
+download_gene_ids()
+download_KEGG_pathways()
+download_SL()
+
+#exclusive for genomic annotation
+download_cosmic()
+download_clinvar()
+download_pfam()
+download_interpro()
+download_uniprot()
+download_hallmarks()
diff --git a/src/manualreview.py b/src/manualreview.py
new file mode 100644
index 0000000..e605d95
--- /dev/null
+++ b/src/manualreview.py
@@ -0,0 +1,243 @@
+#!/usr/bin/python
+
+import re
+import pdb
+import pandas as pd
+import progressbar
+import sys
+
+#Set to directory with processed files, pandrugs previous version and manual review files
+pro_dir = 'processed/'
+mr_dir = 'manual_review/'
+pd_prev = 'Pandrugs_Feb2020.tsv'
+
+if len(sys.argv) < 2:
+    print("Usage: python manualreview.py level [1 = create synonyms file, 2 = create drug info file]")
+    sys.exit()
+else:
+    print(sys.argv)
+
+def process_drug_names():
+
+    print('Creating file with synonyms for review...')
+
+    drugs = []
+
+    files = ['DGIdb_interactions.tsv', 'oncokb.tsv', 'civic.tsv', 'DrugBank.tsv', 'moalmanac.tsv', 'GDSC.tsv', 'sabdab.tsv']
+
+    for f in files:
+        inputf = open(pro_dir+f, 'r')
+        for line in inputf:
+            line = line.rstrip('\n')
+            line_a = line.split('\t')
+            if 'drug_name' in line_a:
+                drug_index = line_a.index('drug_name')
+                continue
+            drugs.append(line_a[drug_index].upper())
+        inputf.close()
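+    # Added note (not in the original source): pubchempy.get_synonyms returns records
+    # shaped like [{'CID': 5291, 'Synonym': ['imatinib', 'Gleevec', ...]}] (illustrative
+    # values); the loop below prefers the synonym tagged '[INN]' as the display name
+    # and falls back to the first synonym otherwise.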
+
+    outputf = open(mr_dir+synonyms_mr, 'w')
+    outputf.write('\t'.join(['drug_name', 'short_drug_name', 'standard_drug_name', 'short_standard_drug_name', 'show_drug_name', 'short_show_drug_name', 'synonyms', 'review'])+'\n')
+
+    import pubchempy as pcp
+
+    drugs = list(set(drugs))
+    for i in progressbar.progressbar(range(len(drugs))):
+        response = pcp.get_synonyms(drugs[i], 'name')
+        INNname = ''
+        if len(response) > 0:
+            for e in response[0]['Synonym']:
+                if '[' in e and 'INN' in e and 'INN-' not in e:
+                    INNname = e.split(' [')[0]
+                    break
+            if INNname == '': INNname = response[0]['Synonym'][0]
+            outputf.write('\t'.join([drugs[i], drugs[i].split(' ')[0], response[0]['Synonym'][0], response[0]['Synonym'][0].split(' ')[0], INNname, INNname.split(' ')[0], '::'.join(response[0]['Synonym']), ''])+'\n')
+        else:
+            outputf.write('\t'.join([drugs[i], drugs[i].split(' ')[0], drugs[i], drugs[i].split(' ')[0], drugs[i], drugs[i].split(' ')[0], '', ''])+'\n')
+
+    outputf.close()
+
+    drug_file = pd.read_csv(mr_dir+synonyms_mr, sep='\t', low_memory=False)
+    drug_file = drug_file.fillna('')
+
+    drugsyn = {}
+
+    for index in progressbar.progressbar(range(drug_file.index.size)):
+        row = drug_file.iloc[index]
+        synonyms = row['synonyms'].split('::')
+        for syn in synonyms:
+            if syn in drugsyn.keys():
+                if row['drug_name'] not in drugsyn[syn].keys():
+                    drugsyn[syn][row['drug_name']] = row['standard_drug_name']
+            else:
+                drugsyn[syn] = {}
+                drugsyn[syn][row['drug_name']] = row['standard_drug_name']
+
+    for syn in drugsyn.keys():
+        if syn != '' and len(drugsyn[syn].keys()) > 1 and len(list(set(list(map(drugsyn[syn].get,list(drugsyn[syn].keys())))))) > 1:
+            for d in drugsyn[syn].keys():
+                drug_file.loc[drug_file['drug_name'] == d,'review'] = ','.join(list(drugsyn[syn].keys()))
+
+    drug_file.to_csv(mr_dir+synonyms_mr, index=False, sep='\t', header=True)
+
+def process_FDA_EMA():
+
+    print('Collecting FDA and EMA information about status...')
+
+    fda_status_file = 'fda_status.tsv'
+    approved_drug_data_mr = 'drug_approved_data_mr.tsv'
+    ema_status_dwn = 'ema_status.tsv'
+    fda_labels_dwn = 'fda_labels.tsv'
+
+    fda_status = pd.read_csv(pro_dir+fda_status_file, sep='\t', low_memory=False)
+    fda_status.fillna('', inplace=True)
+
+    ema_status = pd.read_csv(pro_dir+ema_status_dwn, sep='\t', low_memory=False)
+    ema_status.fillna('', inplace=True)
+    ema_status['International non-proprietary name (INN) / common name'] = ema_status['International non-proprietary name (INN) / common name'].str.upper()
+
+    fda_labels = pd.read_csv(pro_dir+fda_labels_dwn, sep='\t', low_memory=False)
+    fda_labels.fillna('', inplace=True)
+    appl_list = fda_labels['ApplNo'].tolist()
+
+    status_CT_file = pd.read_csv(pro_dir+'clinicaltrials.tsv', sep='\t', low_memory=False)
+    status_CT_file['drug'] = status_CT_file['drug'].str.upper()
+    status_CT_file[['drug']] = status_CT_file[['drug']].fillna('')
+
+    fileo = open(mr_dir+approved_drug_data_mr, 'w')
+    fileo.write('\t'.join(['standard_drug_name','ApplNo','ApplNo_labels','status_FDA','status_EMA','ct_condition','status_mc','PrevStat','indication_FDA','indication_EMA','cancer_term','indication_mc','indication_prev','pathology_mc','pathology_prev','cancer_type_mc','cancer_type_prev','drug_type_mc','drug_type_prev','review'])+'\n')
+
+    drug_list = list(set(synonyms['standard_drug_name'].tolist()))
+    for i in progressbar.progressbar(range(len(drug_list))):
+        standard_drug_name = drug_list[i]
+        #list of possible names in pandrugs database
+        synonyms_list = synonyms.loc[synonyms['standard_drug_name'] == standard_drug_name][['drug_name', 'show_drug_name', 'standard_drug_name', 'short_drug_name', 'short_standard_drug_name', 'short_show_drug_name']].values.flatten().tolist()
+        synonyms_list = [x for x in synonyms_list if str(x) != 'nan']
+
+        (fda_data,status_ema) = (['', ''],'')
+        review = []
+
+        #Data from FDA
+        found = False
+        for idx, syn in enumerate(synonyms_list):
+            if syn in fda_status['drug'].tolist():
+                fda_data = [str(x) for x in fda_status.loc[fda_status['drug'] == syn].values.flatten().tolist()[1:]]
+                if idx in [1,3,5]: review.append('fda:'+fda_status.loc[fda_status['drug'] == syn]['drug'].values.flatten().tolist()[0])
+                found = True
+                break
+        if not found:
+            for idx, syn in enumerate(synonyms_list):
+                if syn in fda_status['drug'].str.split(' ', n = 1, expand = True)[0].tolist():
+                    index = fda_status['drug'].str.split(' ', n = 1, expand = True)[0].tolist().index(syn)
+                    fda_data = [str(x) for x in fda_status.iloc[[index]].values.flatten().tolist()[1:]]
+                    review.append('fda:'+fda_status.iloc[[index]]['drug'].values.flatten().tolist()[0])
+                    break
+
+        #Data from EMA
+        (status_ema, indication_ema) = ('', '')
+        found = False
+        for idx, syn in enumerate(synonyms_list):
+            if syn in ema_status['International non-proprietary name (INN) / common name'].tolist():
+                status_ema = list(set(ema_status.loc[ema_status['International non-proprietary name (INN) / common name'] == syn]['Authorisation status'].values.flatten().tolist()))
+                indication_ema = [x.replace('\t',' ') for x in list(set(ema_status.loc[ema_status['International non-proprietary name (INN) / common name'] == syn]['Condition / indication'].values.flatten().tolist()))]
+                if idx in [1,3,5]: review.append('ema:'+ema_status.loc[ema_status['International non-proprietary name (INN) / common name'] == syn]['International non-proprietary name (INN) / common name'].values.flatten().tolist()[0])
+                found = True
+                break
+        if not found:
+            for idx, syn in enumerate(synonyms_list):
+                if syn in ema_status['International non-proprietary name (INN) / common name'].str.split(' ', n = 1, expand = True)[0].tolist():
+                    index = ema_status['International non-proprietary name (INN) / common name'].str.split(' ', n = 1, expand = True)[0].tolist().index(syn)
+                    status_ema = list(set(ema_status.iloc[[index]]['Authorisation status'].values.flatten().tolist()))
+                    indication_ema = [x.replace('\t', ' ') for x in list(set(ema_status.iloc[[index]]['Condition / indication'].values.flatten().tolist()))]
+                    review.append('ema:'+ema_status.iloc[[index]]['International non-proprietary name (INN) / common name'].values.flatten().tolist()[0])
+                    break
+
+        #Data from FDA labels
+        indications = []
+        applnos = []
+
+        if fda_data[1] != '':
+            applno = fda_data[1].split(';')
+
+            for app in applno:
+                app_label = ''
+                if 'NDA'+app.zfill(6) in appl_list:
+                    app_label = 'NDA'+app.zfill(6)
+                elif 'ANDA'+app.zfill(6) in appl_list:
+                    app_label = 'ANDA'+app.zfill(6)
+                elif 'BLA'+app.zfill(6) in appl_list:
+                    app_label = 'BLA'+app.zfill(6)
+                if app_label != '':
+                    for index2, row2 in fda_labels.loc[fda_labels['ApplNo'] == app_label].iterrows():
+                        indications.append(row2.tolist()[0])
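+                    # Added note (not in the original source): fda_status stores bare
+                    # application numbers, while label records are keyed as NDA/ANDA/BLA
+                    # plus a six-digit zero-padded number, hence the zfill(6) probes above;
+                    # applnos keeps only numbers for which a label was actually found.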
applnos.append(app) + + #Data from clinical trials + ct_condition = '' + found = False + for idx, syn in enumerate(synonyms_list): + if syn in status_CT_file['drug'].tolist(): + ct_condition = list(set(status_CT_file.loc[status_CT_file['drug'] == syn]['condition'].values.flatten().tolist())) + if idx in [1,3,5]: review.append('ct:'+status_CT_file.loc[status_CT_file['drug'] == syn]['drug'].values.flatten().tolist()[0]) + found = True + break + if not found: + for idx, syn in enumerate(synonyms_list): + if syn in status_CT_file['drug'].str.split(' ', n = 1, expand = True)[0].tolist(): + index = status_CT_file['drug'].str.split(' ', n = 1, expand = True)[0].tolist().index(syn) + ct_condition = list(set(status_CT_file.iloc[[index]]['condition'].values.flatten().tolist())) + review.append('ct:'+status_CT_file.iloc[[index]]['drug'].values.flatten().tolist()[0]) + break + ct_condition = [str(x) for x in ct_condition] + + #Information from previous version + prev_values = ['','','','',''] + if standard_drug_name in pandrugs_prev['standard_drug_name'].tolist(): + prev_values = pandrugs_prev.loc[pandrugs_prev['standard_drug_name'] == standard_drug_name][['status','pathology','cancer','extra','extra2']].values.tolist()[0] + elif standard_drug_name in pandrugs_prev['show_drug_name'].tolist(): + prev_values = pandrugs_prev.loc[pandrugs_prev['show_drug_name'] == standard_drug_name][['status','pathology','cancer','extra','extra2']].values.tolist()[0] + elif standard_drug_name in pandrugs_prev['source_drug_name'].tolist(): + prev_values = pandrugs_prev.loc[pandrugs_prev['source_drug_name'] == standard_drug_name][['status','pathology','cancer','extra','extra2']].values.tolist()[0] + else: + for syn in synonyms_list: + if syn in pandrugs_prev['standard_drug_name'].tolist(): + prev_values = pandrugs_prev.loc[pandrugs_prev['standard_drug_name'] == syn][['status','pathology','cancer','extra','extra2']].values.tolist()[0] + break + elif syn in pandrugs_prev['show_drug_name'].tolist() and str(syn) != 'nan': + prev_values = pandrugs_prev.loc[pandrugs_prev['show_drug_name'] == syn][['status','pathology','cancer','extra','extra2']].values.tolist()[0] + break + elif syn in pandrugs_prev['source_drug_name'].tolist() and str(syn) != 'nan': + prev_values = pandrugs_prev.loc[pandrugs_prev['source_drug_name'] == syn][['status','pathology','cancer','extra','extra2']].values.tolist()[0] + break + + #Type of therapy + dtype = '' + if re.search(('ib$|mab$|ib |mab '), standard_drug_name, re.IGNORECASE): + dtype = 'TARGETED THERAPY' + else: + synonyms_list = [x for x in synonyms_list if str(x) != 'nan'] + for syn in synonyms_list: + if re.search(('ib$|mab$|ib |mab '), syn, re.IGNORECASE): + dtype = 'TARGETED THERAPY' + break + + cancer_term = '' + if re.search('cancer|tumor|oma ','::'.join(list(set(indications)))) != None or re.search('cancer|tumor|oma ','::'.join(list(set(indication_ema)))) != None: + cancer_term = 'yes' + + fileo.write('\t'.join([standard_drug_name, fda_data[1], ';'.join(applnos), fda_data[0], '::'.join(status_ema), '::'.join(ct_condition), '',prev_values[0], '::'.join(list(set(indications))), '::'.join(indication_ema), cancer_term, '', prev_values[3], '', prev_values[1], '', prev_values[2], dtype, prev_values[4], ','.join(review)])+'\n') + + fileo.close() + +synonyms_mr = 'drug_synonyms_mr.tsv' + +if sys.argv[1] == '1': + process_drug_names() +else: + pandrugs_prev = pd.read_csv(pd_prev, sep ='\t', low_memory=False) + pandrugs_prev = pandrugs_prev.fillna('') + synonyms = 
pd.read_csv(mr_dir+synonyms_mr.replace('.tsv', 'd.tsv'), sep ='\t', low_memory=False) + synonyms['drug_name'] = synonyms['drug_name'].str.upper() + synonyms['standard_drug_name'] = synonyms['standard_drug_name'].str.upper() + synonyms['show_drug_name'] = synonyms['show_drug_name'].str.upper() + +# process_FDA_EMA() # run this after reviewing drug_names diff --git a/src/modules/Archive/Extract.pm b/src/modules/Archive/Extract.pm new file mode 100644 index 0000000..7597f91 --- /dev/null +++ b/src/modules/Archive/Extract.pm @@ -0,0 +1,1751 @@ +package Archive::Extract; +use if $] > 5.017, 'deprecate'; + +use strict; + +use Cwd qw[cwd chdir]; +use Carp qw[carp]; +use IPC::Cmd qw[run can_run]; +use FileHandle; +use File::Path qw[mkpath]; +use File::Spec; +use File::Basename qw[dirname basename]; +use Params::Check qw[check]; +use Module::Load::Conditional qw[can_load check_install]; +use Locale::Maketext::Simple Style => 'gettext'; + +### solaris has silly /bin/tar output ### +use constant ON_SOLARIS => $^O eq 'solaris' ? 1 : 0; +use constant ON_NETBSD => $^O =~ m!^(netbsd|minix)$! ? 1 : 0; +use constant ON_OPENBSD => $^O =~ m!^(openbsd|bitrig)$! ? 1 : 0; +use constant ON_FREEBSD => $^O =~ m!^(free|midnight|dragonfly)(bsd)?$! ? 1 : 0; +use constant ON_LINUX => $^O eq 'linux' ? 1 : 0; +use constant FILE_EXISTS => sub { -e $_[0] ? 1 : 0 }; + +### VMS may require quoting upper case command options +use constant ON_VMS => $^O eq 'VMS' ? 1 : 0; + +### Windows needs special treatment of Tar options +use constant ON_WIN32 => $^O eq 'MSWin32' ? 1 : 0; + +### we can't use this extraction method, because of missing +### modules/binaries: +use constant METHOD_NA => []; + +### If these are changed, update @TYPES and the new() POD +use constant TGZ => 'tgz'; +use constant TAR => 'tar'; +use constant GZ => 'gz'; +use constant ZIP => 'zip'; +use constant BZ2 => 'bz2'; +use constant TBZ => 'tbz'; +use constant Z => 'Z'; +use constant LZMA => 'lzma'; +use constant XZ => 'xz'; +use constant TXZ => 'txz'; + +use vars qw[$VERSION $PREFER_BIN $PROGRAMS $WARN $DEBUG + $_ALLOW_BIN $_ALLOW_PURE_PERL $_ALLOW_TAR_ITER + ]; + +$VERSION = '0.88'; +$PREFER_BIN = 0; +$WARN = 1; +$DEBUG = 0; +$_ALLOW_PURE_PERL = 1; # allow pure perl extractors +$_ALLOW_BIN = 1; # allow binary extractors +$_ALLOW_TAR_ITER = 1; # try to use Archive::Tar->iter if available + +# same as all constants +my @Types = ( TGZ, TAR, GZ, ZIP, BZ2, TBZ, Z, LZMA, XZ, TXZ ); + +local $Params::Check::VERBOSE = $Params::Check::VERBOSE = 1; +local $Module::Load::Conditional::FORCE_SAFE_INC = 1; + +=pod + +=head1 NAME + +Archive::Extract - A generic archive extracting mechanism + +=head1 SYNOPSIS + + use Archive::Extract; + + ### build an Archive::Extract object ### + my $ae = Archive::Extract->new( archive => 'foo.tgz' ); + + ### extract to cwd() ### + my $ok = $ae->extract; + + ### extract to /tmp ### + my $ok = $ae->extract( to => '/tmp' ); + + ### what if something went wrong? + my $ok = $ae->extract or die $ae->error; + + ### files from the archive ### + my $files = $ae->files; + + ### dir that was extracted to ### + my $outdir = $ae->extract_path; + + + ### quick check methods ### + $ae->is_tar # is it a .tar file? + $ae->is_tgz # is it a .tar.gz or .tgz file? + $ae->is_gz; # is it a .gz file? + $ae->is_zip; # is it a .zip file? + $ae->is_bz2; # is it a .bz2 file? + $ae->is_tbz; # is it a .tar.bz2 or .tbz file? + $ae->is_lzma; # is it a .lzma file? + $ae->is_xz; # is it a .xz file? + $ae->is_txz; # is it a .tar.xz or .txz file? 
+ + ### absolute path to the archive you provided ### + $ae->archive; + + ### commandline tools, if found ### + $ae->bin_tar # path to /bin/tar, if found + $ae->bin_gzip # path to /bin/gzip, if found + $ae->bin_unzip # path to /bin/unzip, if found + $ae->bin_bunzip2 # path to /bin/bunzip2 if found + $ae->bin_unlzma # path to /bin/unlzma if found + $ae->bin_unxz # path to /bin/unxz if found + +=head1 DESCRIPTION + +Archive::Extract is a generic archive extraction mechanism. + +It allows you to extract any archive file of the type .tar, .tar.gz, +.gz, .Z, tar.bz2, .tbz, .bz2, .zip, .xz,, .txz, .tar.xz or .lzma +without having to worry how it +does so, or use different interfaces for each type by using either +perl modules, or commandline tools on your system. + +See the C section further down for details. + +=cut + + +### see what /bin/programs are available ### +$PROGRAMS = {}; +CMD: for my $pgm (qw[tar unzip gzip bunzip2 uncompress unlzma unxz]) { + if ( $pgm eq 'unzip' and ON_FREEBSD and my $unzip = can_run('info-unzip') ) { + $PROGRAMS->{$pgm} = $unzip; + next CMD; + } + if ( $pgm eq 'unzip' and ( ON_FREEBSD || ON_LINUX ) ) { + local $IPC::Cmd::INSTANCES = 1; + ($PROGRAMS->{$pgm}) = grep { _is_infozip_esque($_) } can_run($pgm); + next CMD; + } + if ( $pgm eq 'unzip' and ON_NETBSD ) { + local $IPC::Cmd::INSTANCES = 1; + ($PROGRAMS->{$pgm}) = grep { m!/usr/pkg/! } can_run($pgm); + next CMD; + } + if ( $pgm eq 'tar' and ( ON_OPENBSD || ON_SOLARIS || ON_NETBSD ) ) { + # try gtar first + next CMD if $PROGRAMS->{$pgm} = can_run('gtar'); + } + $PROGRAMS->{$pgm} = can_run($pgm); +} + +### mapping from types to extractor methods ### +my $Mapping = { # binary program # pure perl module + is_tgz => { bin => '_untar_bin', pp => '_untar_at' }, + is_tar => { bin => '_untar_bin', pp => '_untar_at' }, + is_gz => { bin => '_gunzip_bin', pp => '_gunzip_cz' }, + is_zip => { bin => '_unzip_bin', pp => '_unzip_az' }, + is_tbz => { bin => '_untar_bin', pp => '_untar_at' }, + is_bz2 => { bin => '_bunzip2_bin', pp => '_bunzip2_bz2'}, + is_Z => { bin => '_uncompress_bin', pp => '_gunzip_cz' }, + is_lzma => { bin => '_unlzma_bin', pp => '_unlzma_cz' }, + is_xz => { bin => '_unxz_bin', pp => '_unxz_cz' }, + is_txz => { bin => '_untar_bin', pp => '_untar_at' }, +}; + +{ ### use subs so we re-generate array refs etc for the no-override flags + ### if we don't, then we reuse the same arrayref, meaning objects store + ### previous errors + my $tmpl = { + archive => sub { { required => 1, allow => FILE_EXISTS } }, + type => sub { { default => '', allow => [ @Types ] } }, + _error_msg => sub { { no_override => 1, default => [] } }, + _error_msg_long => sub { { no_override => 1, default => [] } }, + }; + + ### build accessors ### + for my $method( keys %$tmpl, + qw[_extractor _gunzip_to files extract_path], + ) { + no strict 'refs'; + *$method = sub { + my $self = shift; + $self->{$method} = $_[0] if @_; + return $self->{$method}; + } + } + +sub type_for { + local $_ = shift; + return /.+?\.(?:tar\.gz|tgz)$/i ? TGZ : + /.+?\.gz$/i ? GZ : + /.+?\.tar$/i ? TAR : + /.+?\.(zip|jar|ear|war|par)$/i ? ZIP : + /.+?\.(?:tbz2?|tar\.bz2?)$/i ? TBZ : + /.+?\.bz2$/i ? BZ2 : + /.+?\.Z$/ ? Z : + /.+?\.lzma$/ ? LZMA : + /.+?\.(?:txz|tar\.xz)$/i ? TXZ : + /.+?\.xz$/ ? XZ : + ''; +} + +=head1 METHODS + +=head2 $ae = Archive::Extract->new(archive => '/path/to/archive',[type => TYPE]) + +Creates a new C object based on the archive file you +passed it. 
Automatically determines the type of archive based on the +extension, but you can override that by explicitly providing the +C argument, potentially by calling C. + +Valid values for C are: + +=over 4 + +=item tar + +Standard tar files, as produced by, for example, C. +Corresponds to a C<.tar> suffix. + +=item tgz + +Gzip compressed tar files, as produced by, for example C. +Corresponds to a C<.tgz> or C<.tar.gz> suffix. + +=item gz + +Gzip compressed file, as produced by, for example C. +Corresponds to a C<.gz> suffix. + +=item Z + +Lempel-Ziv compressed file, as produced by, for example C. +Corresponds to a C<.Z> suffix. + +=item zip + +Zip compressed file, as produced by, for example C. +Corresponds to a C<.zip>, C<.jar> or C<.par> suffix. + +=item bz2 + +Bzip2 compressed file, as produced by, for example, C. +Corresponds to a C<.bz2> suffix. + +=item tbz + +Bzip2 compressed tar file, as produced by, for example C. +Corresponds to a C<.tbz> or C<.tar.bz2> suffix. + +=item lzma + +Lzma compressed file, as produced by C. +Corresponds to a C<.lzma> suffix. + +=item xz + +Xz compressed file, as produced by C. +Corresponds to a C<.xz> suffix. + +=item txz + +Xz compressed tar file, as produced by, for example C. +Corresponds to a C<.txz> or C<.tar.xz> suffix. + +=back + +Returns a C object on success, or false on failure. + +=cut + + ### constructor ### + sub new { + my $class = shift; + my %hash = @_; + + ### see above why we use subs here and generate the template; + ### it's basically to not re-use arrayrefs + my %utmpl = map { $_ => $tmpl->{$_}->() } keys %$tmpl; + + my $parsed = check( \%utmpl, \%hash ) or return; + + ### make sure we have an absolute path ### + my $ar = $parsed->{archive} = File::Spec->rel2abs( $parsed->{archive} ); + + ### figure out the type, if it wasn't already specified ### + $parsed->{type} ||= type_for $ar; + + bless $parsed, $class; + + ### don't know what type of file it is + ### XXX this *has* to be an object call, not a package call + return $parsed->_error(loc("Cannot determine file type for '%1'", + $parsed->{archive} )) unless $parsed->{type}; + return $parsed; + } +} + +=head2 $ae->extract( [to => '/output/path'] ) + +Extracts the archive represented by the C object to +the path of your choice as specified by the C argument. Defaults to +C. + +Since C<.gz> files never hold a directory, but only a single file; if +the C argument is an existing directory, the file is extracted +there, with its C<.gz> suffix stripped. +If the C argument is not an existing directory, the C argument +is understood to be a filename, if the archive type is C. +In the case that you did not specify a C argument, the output +file will be the name of the archive file, stripped from its C<.gz> +suffix, in the current working directory. + +C will try a pure perl solution first, and then fall back to +commandline tools if they are available. See the C +section below on how to alter this behaviour. + +It will return true on success, and false on failure. + +On success, it will also set the follow attributes in the object: + +=over 4 + +=item $ae->extract_path + +This is the directory that the files where extracted to. + +=item $ae->files + +This is an array ref with the paths of all the files in the archive, +relative to the C argument you specified. +To get the full path to an extracted file, you would use: + + File::Spec->catfile( $to, $ae->files->[0] ); + +Note that all files from a tar archive will be in unix format, as per +the tar specification. 
+ +=back + +=cut + +sub extract { + my $self = shift; + my %hash = @_; + + ### reset error messages + $self->_error_msg( [] ); + $self->_error_msg_long( [] ); + + my $to; + my $tmpl = { + to => { default => '.', store => \$to } + }; + + check( $tmpl, \%hash ) or return; + + ### so 'to' could be a file or a dir, depending on whether it's a .gz + ### file, or basically anything else. + ### so, check that, then act accordingly. + ### set an accessor specifically so _gunzip can know what file to extract + ### to. + my $dir; + { ### a foo.gz file + if( $self->is_gz or $self->is_bz2 or $self->is_Z or $self->is_lzma or $self->is_xz ) { + + my $cp = $self->archive; $cp =~ s/\.(?:gz|bz2?|Z|lzma|xz)$//i; + + ### to is a dir? + if ( -d $to ) { + $dir = $to; + $self->_gunzip_to( basename($cp) ); + + ### then it's a filename + } else { + $dir = dirname($to); + $self->_gunzip_to( basename($to) ); + } + + ### not a foo.gz file + } else { + $dir = $to; + } + } + + ### make the dir if it doesn't exist ### + unless( -d $dir ) { + eval { mkpath( $dir ) }; + + return $self->_error(loc("Could not create path '%1': %2", $dir, $@)) + if $@; + } + + ### get the current dir, to restore later ### + my $cwd = cwd(); + + my $ok = 1; + EXTRACT: { + + ### chdir to the target dir ### + unless( chdir $dir ) { + $self->_error(loc("Could not chdir to '%1': %2", $dir, $!)); + $ok = 0; last EXTRACT; + } + + ### set files to an empty array ref, so there's always an array + ### ref IN the accessor, to avoid errors like: + ### Can't use an undefined value as an ARRAY reference at + ### ../lib/Archive/Extract.pm line 742. (rt #19815) + $self->files( [] ); + + ### find out the dispatch methods needed for this type of + ### archive. Do a $self->is_XXX to figure out the type, then + ### get the hashref with bin + pure perl dispatchers. + my ($map) = map { $Mapping->{$_} } grep { $self->$_ } keys %$Mapping; + + ### add pure perl extractor if allowed & add bin extractor if allowed + my @methods; + push @methods, $map->{'pp'} if $_ALLOW_PURE_PERL; + push @methods, $map->{'bin'} if $_ALLOW_BIN; + + ### reverse it if we prefer bin extractors + @methods = reverse @methods if $PREFER_BIN; + + my($na, $fail); + for my $method (@methods) { + $self->debug( "# Extracting with ->$method\n" ); + + my $rv = $self->$method; + + ### a positive extraction + if( $rv and $rv ne METHOD_NA ) { + $self->debug( "# Extraction succeeded\n" ); + $self->_extractor($method); + last; + + ### method is not available + } elsif ( $rv and $rv eq METHOD_NA ) { + $self->debug( "# Extraction method not available\n" ); + $na++; + } else { + $self->debug( "# Extraction method failed\n" ); + $fail++; + } + } + + ### warn something went wrong if we didn't get an extractor + unless( $self->_extractor ) { + my $diag = $fail ? loc("Extract failed due to errors") : + $na ? loc("Extract failed; no extractors available") : + ''; + + $self->_error($diag); + $ok = 0; + } + } + + ### and chdir back ### + unless( chdir $cwd ) { + $self->_error(loc("Could not chdir back to start dir '%1': %2'", + $cwd, $!)); + } + + return $ok; +} + +=pod + +=head1 ACCESSORS + +=head2 $ae->error([BOOL]) + +Returns the last encountered error as string. +Pass it a true value to get the C output instead. + +=head2 $ae->extract_path + +This is the directory the archive got extracted to. +See C for details. + +=head2 $ae->files + +This is an array ref holding all the paths from the archive. +See C for details. 
+ +=head2 $ae->archive + +This is the full path to the archive file represented by this +C object. + +=head2 $ae->type + +This is the type of archive represented by this C +object. See accessors below for an easier way to use this. +See the C method for details. + +=head2 $ae->types + +Returns a list of all known C for C's +C method. + +=cut + +sub types { return @Types } + +=head2 $ae->is_tgz + +Returns true if the file is of type C<.tar.gz>. +See the C method for details. + +=head2 $ae->is_tar + +Returns true if the file is of type C<.tar>. +See the C method for details. + +=head2 $ae->is_gz + +Returns true if the file is of type C<.gz>. +See the C method for details. + +=head2 $ae->is_Z + +Returns true if the file is of type C<.Z>. +See the C method for details. + +=head2 $ae->is_zip + +Returns true if the file is of type C<.zip>. +See the C method for details. + +=head2 $ae->is_lzma + +Returns true if the file is of type C<.lzma>. +See the C method for details. + +=head2 $ae->is_xz + +Returns true if the file is of type C<.xz>. +See the C method for details. + +=cut + +### quick check methods ### +sub is_tgz { return $_[0]->type eq TGZ } +sub is_tar { return $_[0]->type eq TAR } +sub is_gz { return $_[0]->type eq GZ } +sub is_zip { return $_[0]->type eq ZIP } +sub is_tbz { return $_[0]->type eq TBZ } +sub is_bz2 { return $_[0]->type eq BZ2 } +sub is_Z { return $_[0]->type eq Z } +sub is_lzma { return $_[0]->type eq LZMA } +sub is_xz { return $_[0]->type eq XZ } +sub is_txz { return $_[0]->type eq TXZ } + +=pod + +=head2 $ae->bin_tar + +Returns the full path to your tar binary, if found. + +=head2 $ae->bin_gzip + +Returns the full path to your gzip binary, if found + +=head2 $ae->bin_unzip + +Returns the full path to your unzip binary, if found + +=head2 $ae->bin_unlzma + +Returns the full path to your unlzma binary, if found + +=head2 $ae->bin_unxz + +Returns the full path to your unxz binary, if found + +=cut + +### paths to commandline tools ### +sub bin_gzip { return $PROGRAMS->{'gzip'} if $PROGRAMS->{'gzip'} } +sub bin_unzip { return $PROGRAMS->{'unzip'} if $PROGRAMS->{'unzip'} } +sub bin_tar { return $PROGRAMS->{'tar'} if $PROGRAMS->{'tar'} } +sub bin_bunzip2 { return $PROGRAMS->{'bunzip2'} if $PROGRAMS->{'bunzip2'} } +sub bin_uncompress { return $PROGRAMS->{'uncompress'} + if $PROGRAMS->{'uncompress'} } +sub bin_unlzma { return $PROGRAMS->{'unlzma'} if $PROGRAMS->{'unlzma'} } +sub bin_unxz { return $PROGRAMS->{'unxz'} if $PROGRAMS->{'unxz'} } + +=head2 $bool = $ae->have_old_bunzip2 + +Older versions of C, from before the C release, +require all archive names to end in C<.bz2> or it will not extract +them. This method checks if you have a recent version of C +that allows any extension, or an older one that doesn't. + +=cut + +sub have_old_bunzip2 { + my $self = shift; + + ### no bunzip2? no old bunzip2 either :) + return unless $self->bin_bunzip2; + + ### if we can't run this, we can't be sure if it's too old or not + ### XXX stupid stupid stupid bunzip2 doesn't understand --version + ### is not a request to extract data: + ### $ bunzip2 --version + ### bzip2, a block-sorting file compressor. Version 1.0.2, 30-Dec-2001. + ### [...] + ### bunzip2: I won't read compressed data from a terminal. + ### bunzip2: For help, type: `bunzip2 --help'. + ### $ echo $? + ### 1 + ### HATEFUL! 
+ + ### double hateful: bunzip2 --version also hangs if input is a pipe + ### See #32370: Archive::Extract will hang if stdin is a pipe [+PATCH] + ### So, we have to provide *another* argument which is a fake filename, + ### just so it wont try to read from stdin to print its version.. + ### *sigh* + ### Even if the file exists, it won't clobber or change it. + my $buffer; + scalar run( + command => [$self->bin_bunzip2, '--version', 'NoSuchFile'], + verbose => 0, + buffer => \$buffer + ); + + ### no output + return unless $buffer; + + my ($version) = $buffer =~ /version \s+ (\d+)/ix; + + return 1 if $version < 1; + return; +} + +################################# +# +# Untar code +# +################################# + +### annoying issue with (gnu) tar on win32, as illustrated by this +### bug: https://rt.cpan.org/Ticket/Display.html?id=40138 +### which shows that (gnu) tar will interpret a file name with a : +### in it as a remote file name, so C:\tmp\foo.txt is interpreted +### as a remote shell, and the extract fails. +{ my @ExtraTarFlags; + if( ON_WIN32 and my $cmd = __PACKAGE__->bin_tar ) { + $cmd = $1 if $cmd =~ m{^(.+)}s; # Tainted perl # + ### if this is gnu tar we are running, we need to use --force-local + push @ExtraTarFlags, '--force-local' if `$cmd --version` =~ /gnu tar/i; + } + + + ### use /bin/tar to extract ### + sub _untar_bin { + my $self = shift; + + ### check for /bin/tar ### + ### check for /bin/gzip if we need it ### + ### if any of the binaries are not available, return NA + { my $diag = !$self->bin_tar ? + loc("No '%1' program found", '/bin/tar') : + $self->is_tgz && !$self->bin_gzip ? + loc("No '%1' program found", '/bin/gzip') : + $self->is_tbz && !$self->bin_bunzip2 ? + loc("No '%1' program found", '/bin/bunzip2') : + $self->is_txz && !$self->bin_unxz ? + loc("No '%1' program found", '/bin/unxz') : + ''; + + if( $diag ) { + $self->_error( $diag ); + return METHOD_NA; + } + } + + ### XXX figure out how to make IPC::Run do this in one call -- + ### currently i don't know how to get output of a command after a pipe + ### trapped in a scalar. Mailed barries about this 5th of june 2004. + + ### see what command we should run, based on whether + ### it's a .tgz or .tar + + ### GNU tar can't handled VMS filespecs, but VMSTAR can handle Unix filespecs. + my $archive = $self->archive; + $archive = VMS::Filespec::unixify($archive) if ON_VMS; + + ### XXX solaris tar and bsdtar are having different outputs + ### depending whether you run with -x or -t + ### compensate for this insanity by running -t first, then -x + { my $cmd = + $self->is_tgz ? [$self->bin_gzip, '-c', '-d', '-f', $archive, '|', + $self->bin_tar, '-tf', '-'] : + $self->is_tbz ? [$self->bin_bunzip2, '-cd', $archive, '|', + $self->bin_tar, '-tf', '-'] : + $self->is_txz ? [$self->bin_unxz, '-cd', $archive, '|', + $self->bin_tar, '-tf', '-'] : + [$self->bin_tar, @ExtraTarFlags, '-tf', $archive]; + + ### run the command + ### newer versions of 'tar' (1.21 and up) now print record size + ### to STDERR as well if v OR t is given (used to be both). This + ### is a 'feature' according to the changelog, so we must now only + ### inspect STDOUT, otherwise, failures like these occur: + ### http://www.cpantesters.org/cpan/report/3230366 + my $buffer = ''; + my @out = run( command => $cmd, + buffer => \$buffer, + verbose => $DEBUG ); + + ### command was unsuccessful + unless( $out[0] ) { + return $self->_error(loc( + "Error listing contents of archive '%1': %2", + $archive, $buffer )); + } + + ### no buffers available? 
+ if( !IPC::Cmd->can_capture_buffer and !$buffer ) { + $self->_error( $self->_no_buffer_files( $archive ) ); + + } else { + ### if we're on solaris we /might/ be using /bin/tar, which has + ### a weird output format... we might also be using + ### /usr/local/bin/tar, which is gnu tar, which is perfectly + ### fine... so we have to do some guessing here =/ + my @files = map { chomp; s!\x0D!!g if ON_WIN32; + !ON_SOLARIS ? $_ + : (m|^ x \s+ # 'xtract' -- sigh + (.+?), # the actual file name + \s+ [\d,.]+ \s bytes, + \s+ [\d,.]+ \s tape \s blocks + |x ? $1 : $_); + + ### only STDOUT, see above. Sometimes, extra whitespace + ### is present, so make sure we only pick lines with + ### a length + } grep { length } map { split $/, $_ } join '', @{$out[3]}; + + ### store the files that are in the archive ### + $self->files(\@files); + } + } + + ### now actually extract it ### + { my $cmd = + $self->is_tgz ? [$self->bin_gzip, '-c', '-d', '-f', $archive, '|', + $self->bin_tar, '-xf', '-'] : + $self->is_tbz ? [$self->bin_bunzip2, '-cd', $archive, '|', + $self->bin_tar, '-xf', '-'] : + $self->is_txz ? [$self->bin_unxz, '-cd', $archive, '|', + $self->bin_tar, '-xf', '-'] : + [$self->bin_tar, @ExtraTarFlags, '-xf', $archive]; + + my $buffer = ''; + unless( scalar run( command => $cmd, + buffer => \$buffer, + verbose => $DEBUG ) + ) { + return $self->_error(loc("Error extracting archive '%1': %2", + $archive, $buffer )); + } + + ### we might not have them, due to lack of buffers + if( $self->files ) { + ### now that we've extracted, figure out where we extracted to + my $dir = $self->__get_extract_dir( $self->files ); + + ### store the extraction dir ### + $self->extract_path( $dir ); + } + } + + ### we got here, no error happened + return 1; + } +} + + +### use archive::tar to extract ### +sub _untar_at { + my $self = shift; + + ### Loading Archive::Tar is going to set it to 1, so make it local + ### within this block, starting with its initial value. Whatever + ### Achive::Tar does will be undone when we return. + ### + ### Also, later, set $Archive::Tar::WARN to $Archive::Extract::WARN + ### so users don't have to even think about this variable. If they + ### do, they still get their set value outside of this call. + local $Archive::Tar::WARN = $Archive::Tar::WARN; + + ### we definitely need Archive::Tar, so load that first + { my $use_list = { 'Archive::Tar' => '0.0' }; + + unless( can_load( modules => $use_list ) ) { + + $self->_error(loc("You do not have '%1' installed - " . + "Please install it as soon as possible.", + 'Archive::Tar')); + + return METHOD_NA; + } + } + + ### we might pass it a filehandle if it's a .tbz file.. + my $fh_to_read = $self->archive; + + ### we will need Compress::Zlib too, if it's a tgz... and IO::Zlib + ### if A::T's version is 0.99 or higher + if( $self->is_tgz ) { + my $use_list = { 'Compress::Zlib' => '0.0' }; + { + local $@; + $use_list->{ 'IO::Zlib' } = '0.0' + if eval { Archive::Tar->VERSION('0.99'); 1 }; + } + + unless( can_load( modules => $use_list ) ) { + my $which = join '/', sort keys %$use_list; + + $self->_error(loc( + "You do not have '%1' installed - Please ". + "install it as soon as possible.", $which) + ); + + return METHOD_NA; + } + + } elsif ( $self->is_tbz ) { + my $use_list = { 'IO::Uncompress::Bunzip2' => '0.0' }; + unless( can_load( modules => $use_list ) ) { + $self->_error(loc( + "You do not have '%1' installed - Please " . 
+ "install it as soon as possible.", + 'IO::Uncompress::Bunzip2') + ); + + return METHOD_NA; + } + + my $bz = IO::Uncompress::Bunzip2->new( $self->archive ) or + return $self->_error(loc("Unable to open '%1': %2", + $self->archive, + $IO::Uncompress::Bunzip2::Bunzip2Error)); + + $fh_to_read = $bz; + } elsif ( $self->is_txz ) { + my $use_list = { 'IO::Uncompress::UnXz' => '0.0' }; + unless( can_load( modules => $use_list ) ) { + $self->_error(loc( + "You do not have '%1' installed - Please " . + "install it as soon as possible.", + 'IO::Uncompress::UnXz') + ); + + return METHOD_NA; + } + + my $xz = IO::Uncompress::UnXz->new( $self->archive ) or + return $self->_error(loc("Unable to open '%1': %2", + $self->archive, + $IO::Uncompress::UnXz::UnXzError)); + + $fh_to_read = $xz; + } + + my @files; + { + ### $Archive::Tar::WARN is 1 by default in Archive::Tar, but we've + ### localized $Archive::Tar::WARN already. + $Archive::Tar::WARN = $Archive::Extract::WARN; + + ### only tell it it's compressed if it's a .tgz, as we give it a file + ### handle if it's a .tbz + my @read = ( $fh_to_read, ( $self->is_tgz ? 1 : 0 ) ); + + ### for version of Archive::Tar > 1.04 + local $Archive::Tar::CHOWN = 0; + + ### use the iterator if we can. it's a feature of A::T 1.40 and up + if ( $_ALLOW_TAR_ITER && Archive::Tar->can( 'iter' ) ) { + + my $next; + unless ( $next = Archive::Tar->iter( @read ) ) { + return $self->_error(loc( + "Unable to read '%1': %2", $self->archive, + $Archive::Tar::error)); + } + + while ( my $file = $next->() ) { + push @files, $file->full_path; + + $file->extract or return $self->_error(loc( + "Unable to read '%1': %2", + $self->archive, + $Archive::Tar::error)); + } + + ### older version, read the archive into memory + } else { + + my $tar = Archive::Tar->new(); + + unless( $tar->read( @read ) ) { + return $self->_error(loc("Unable to read '%1': %2", + $self->archive, $Archive::Tar::error)); + } + + ### workaround to prevent Archive::Tar from setting uid, which + ### is a potential security hole. -autrijus + ### have to do it here, since A::T needs to be /loaded/ first ### + { no strict 'refs'; local $^W; + + ### older versions of archive::tar <= 0.23 + *Archive::Tar::chown = sub {}; + } + + { local $^W; # quell 'splice() offset past end of array' warnings + # on older versions of A::T + + ### older archive::tar always returns $self, return value + ### slightly fux0r3d because of it. + $tar->extract or return $self->_error(loc( + "Unable to extract '%1': %2", + $self->archive, $Archive::Tar::error )); + } + + @files = $tar->list_files; + } + } + + my $dir = $self->__get_extract_dir( \@files ); + + ### store the files that are in the archive ### + $self->files(\@files); + + ### store the extraction dir ### + $self->extract_path( $dir ); + + ### check if the dir actually appeared ### + return 1 if -d $self->extract_path; + + ### no dir, we failed ### + return $self->_error(loc("Unable to extract '%1': %2", + $self->archive, $Archive::Tar::error )); +} + +################################# +# +# Gunzip code +# +################################# + +sub _gunzip_bin { + my $self = shift; + + ### check for /bin/gzip -- we need it ### + unless( $self->bin_gzip ) { + $self->_error(loc("No '%1' program found", '/bin/gzip')); + return METHOD_NA; + } + + my $fh = FileHandle->new('>'. $self->_gunzip_to) or + return $self->_error(loc("Could not open '%1' for writing: %2", + $self->_gunzip_to, $! 
)); + + my $cmd = [ $self->bin_gzip, '-c', '-d', '-f', $self->archive ]; + + my $buffer; + unless( scalar run( command => $cmd, + verbose => $DEBUG, + buffer => \$buffer ) + ) { + return $self->_error(loc("Unable to gunzip '%1': %2", + $self->archive, $buffer)); + } + + ### no buffers available? + if( !IPC::Cmd->can_capture_buffer and !$buffer ) { + $self->_error( $self->_no_buffer_content( $self->archive ) ); + } + + $self->_print($fh, $buffer) if defined $buffer; + + close $fh; + + ### set what files where extract, and where they went ### + $self->files( [$self->_gunzip_to] ); + $self->extract_path( File::Spec->rel2abs(cwd()) ); + + return 1; +} + +sub _gunzip_cz { + my $self = shift; + + my $use_list = { 'Compress::Zlib' => '0.0' }; + unless( can_load( modules => $use_list ) ) { + $self->_error(loc("You do not have '%1' installed - Please " . + "install it as soon as possible.", 'Compress::Zlib')); + return METHOD_NA; + } + + my $gz = Compress::Zlib::gzopen( $self->archive, "rb" ) or + return $self->_error(loc("Unable to open '%1': %2", + $self->archive, $Compress::Zlib::gzerrno)); + + my $fh = FileHandle->new('>'. $self->_gunzip_to) or + return $self->_error(loc("Could not open '%1' for writing: %2", + $self->_gunzip_to, $! )); + + my $buffer; + $self->_print($fh, $buffer) while $gz->gzread($buffer) > 0; + $fh->close; + + ### set what files where extract, and where they went ### + $self->files( [$self->_gunzip_to] ); + $self->extract_path( File::Spec->rel2abs(cwd()) ); + + return 1; +} + +################################# +# +# Uncompress code +# +################################# + +sub _uncompress_bin { + my $self = shift; + + ### check for /bin/gzip -- we need it ### + unless( $self->bin_uncompress ) { + $self->_error(loc("No '%1' program found", '/bin/uncompress')); + return METHOD_NA; + } + + my $fh = FileHandle->new('>'. $self->_gunzip_to) or + return $self->_error(loc("Could not open '%1' for writing: %2", + $self->_gunzip_to, $! )); + + my $cmd = [ $self->bin_uncompress, '-c', $self->archive ]; + + my $buffer; + unless( scalar run( command => $cmd, + verbose => $DEBUG, + buffer => \$buffer ) + ) { + return $self->_error(loc("Unable to uncompress '%1': %2", + $self->archive, $buffer)); + } + + ### no buffers available? + if( !IPC::Cmd->can_capture_buffer and !$buffer ) { + $self->_error( $self->_no_buffer_content( $self->archive ) ); + } + + $self->_print($fh, $buffer) if defined $buffer; + + close $fh; + + ### set what files where extract, and where they went ### + $self->files( [$self->_gunzip_to] ); + $self->extract_path( File::Spec->rel2abs(cwd()) ); + + return 1; +} + + +################################# +# +# Unzip code +# +################################# + + +sub _unzip_bin { + my $self = shift; + + ### check for /bin/gzip if we need it ### + unless( $self->bin_unzip ) { + $self->_error(loc("No '%1' program found", '/bin/unzip')); + return METHOD_NA; + } + + ### first, get the files.. it must be 2 different commands with 'unzip' :( + { ### on VMS, capital letter options have to be quoted. This is + ### reported by John Malmberg on P5P Tue 21 Aug 2007 05:05:11 + ### Subject: [patch@31735]Archive Extract fix on VMS. + my $opt = ON_VMS ? '"-Z"' : '-Z'; + my $cmd = [ $self->bin_unzip, $opt, '-1', $self->archive ]; + + my $buffer = ''; + unless( scalar run( command => $cmd, + verbose => $DEBUG, + buffer => \$buffer ) + ) { + return $self->_error(loc("Unable to unzip '%1': %2", + $self->archive, $buffer)); + } + + ### no buffers available? 
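+        ### ('-Z' ran unzip in zipinfo mode and '-1' printed bare
+        ### member names, one per line, so the captured buffer can be
+        ### split directly into the file list below.)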
+ if( !IPC::Cmd->can_capture_buffer and !$buffer ) { + $self->_error( $self->_no_buffer_files( $self->archive ) ); + + } else { + ### Annoyingly, pesky MSWin32 can either have 'native' tools + ### which have \r\n line endings or Cygwin-based tools which + ### have \n line endings. Jan Dubois suggested using this fix + my $split = ON_WIN32 ? qr/\r?\n/ : "\n"; + $self->files( [split $split, $buffer] ); + } + } + + ### now, extract the archive ### + { my $cmd = [ $self->bin_unzip, '-qq', '-o', $self->archive ]; + + my $buffer; + unless( scalar run( command => $cmd, + verbose => $DEBUG, + buffer => \$buffer ) + ) { + return $self->_error(loc("Unable to unzip '%1': %2", + $self->archive, $buffer)); + } + + if( scalar @{$self->files} ) { + my $files = $self->files; + my $dir = $self->__get_extract_dir( $files ); + + $self->extract_path( $dir ); + } + } + + return 1; +} + +sub _unzip_az { + my $self = shift; + + my $use_list = { 'Archive::Zip' => '0.0' }; + unless( can_load( modules => $use_list ) ) { + $self->_error(loc("You do not have '%1' installed - Please " . + "install it as soon as possible.", 'Archive::Zip')); + return METHOD_NA; + } + + my $zip = Archive::Zip->new(); + + unless( $zip->read( $self->archive ) == &Archive::Zip::AZ_OK ) { + return $self->_error(loc("Unable to read '%1'", $self->archive)); + } + + my @files; + + + ### Address: #43278: Explicitly tell Archive::Zip where to put the files: + ### "In my BackPAN indexing, Archive::Zip was extracting things + ### in my script's directory instead of the current working directory. + ### I traced this back through Archive::Zip::_asLocalName which + ### eventually calls File::Spec::Win32::rel2abs which on Windows might + ### call Cwd::getdcwd. getdcwd returns the wrong directory in my + ### case, even though I think I'm on the same drive. + ### + ### To fix this, I pass the optional second argument to + ### extractMember using the cwd from Archive::Extract." --bdfoy + + ## store cwd() before looping; calls to cwd() can be expensive, and + ### it won't change during the loop + my $extract_dir = cwd(); + + ### have to extract every member individually ### + for my $member ($zip->members) { + push @files, $member->{fileName}; + + ### file to extract to, to avoid the above problem + my $to = File::Spec->catfile( $extract_dir, $member->{fileName} ); + + unless( $zip->extractMember($member, $to) == &Archive::Zip::AZ_OK ) { + return $self->_error(loc("Extraction of '%1' from '%2' failed", + $member->{fileName}, $self->archive )); + } + } + + my $dir = $self->__get_extract_dir( \@files ); + + ### set what files where extract, and where they went ### + $self->files( \@files ); + $self->extract_path( File::Spec->rel2abs($dir) ); + + return 1; +} + +sub __get_extract_dir { + my $self = shift; + my $files = shift || []; + + return unless scalar @$files; + + my($dir1, $dir2); + for my $aref ( [ \$dir1, 0 ], [ \$dir2, -1 ] ) { + my($dir,$pos) = @$aref; + + ### add a catdir(), so that any trailing slashes get + ### take care of (removed) + ### also, a catdir() normalises './dir/foo' to 'dir/foo'; + ### which was the problem in bug #23999 + my $res = -d $files->[$pos] + ? File::Spec->catdir( $files->[$pos], '' ) + : File::Spec->catdir( dirname( $files->[$pos] ) ); + + $$dir = $res; + } + + ### if the first and last dir don't match, make sure the + ### dirname is not set wrongly + my $dir; + + ### dirs are the same, so we know for sure what the extract dir is + if( $dir1 eq $dir2 ) { + $dir = $dir1; + + ### dirs are different.. 
do they share the base dir? + ### if so, use that, if not, fall back to '.' + } else { + my $base1 = [ File::Spec->splitdir( $dir1 ) ]->[0]; + my $base2 = [ File::Spec->splitdir( $dir2 ) ]->[0]; + + $dir = File::Spec->rel2abs( $base1 eq $base2 ? $base1 : '.' ); + } + + return File::Spec->rel2abs( $dir ); +} + +################################# +# +# Bunzip2 code +# +################################# + +sub _bunzip2_bin { + my $self = shift; + + ### check for /bin/gzip -- we need it ### + unless( $self->bin_bunzip2 ) { + $self->_error(loc("No '%1' program found", '/bin/bunzip2')); + return METHOD_NA; + } + + my $fh = FileHandle->new('>'. $self->_gunzip_to) or + return $self->_error(loc("Could not open '%1' for writing: %2", + $self->_gunzip_to, $! )); + + ### guard against broken bunzip2. See ->have_old_bunzip2() + ### for details + if( $self->have_old_bunzip2 and $self->archive !~ /\.bz2$/i ) { + return $self->_error(loc("Your bunzip2 version is too old and ". + "can only extract files ending in '%1'", + '.bz2')); + } + + my $cmd = [ $self->bin_bunzip2, '-cd', $self->archive ]; + + my $buffer; + unless( scalar run( command => $cmd, + verbose => $DEBUG, + buffer => \$buffer ) + ) { + return $self->_error(loc("Unable to bunzip2 '%1': %2", + $self->archive, $buffer)); + } + + ### no buffers available? + if( !IPC::Cmd->can_capture_buffer and !$buffer ) { + $self->_error( $self->_no_buffer_content( $self->archive ) ); + } + + $self->_print($fh, $buffer) if defined $buffer; + + close $fh; + + ### set what files where extract, and where they went ### + $self->files( [$self->_gunzip_to] ); + $self->extract_path( File::Spec->rel2abs(cwd()) ); + + return 1; +} + +### using cz2, the compact versions... this we use mainly in archive::tar +### extractor.. +# sub _bunzip2_cz1 { +# my $self = shift; +# +# my $use_list = { 'IO::Uncompress::Bunzip2' => '0.0' }; +# unless( can_load( modules => $use_list ) ) { +# return $self->_error(loc("You do not have '%1' installed - Please " . +# "install it as soon as possible.", +# 'IO::Uncompress::Bunzip2')); +# } +# +# my $bz = IO::Uncompress::Bunzip2->new( $self->archive ) or +# return $self->_error(loc("Unable to open '%1': %2", +# $self->archive, +# $IO::Uncompress::Bunzip2::Bunzip2Error)); +# +# my $fh = FileHandle->new('>'. $self->_gunzip_to) or +# return $self->_error(loc("Could not open '%1' for writing: %2", +# $self->_gunzip_to, $! )); +# +# my $buffer; +# $fh->print($buffer) while $bz->read($buffer) > 0; +# $fh->close; +# +# ### set what files where extract, and where they went ### +# $self->files( [$self->_gunzip_to] ); +# $self->extract_path( File::Spec->rel2abs(cwd()) ); +# +# return 1; +# } + +sub _bunzip2_bz2 { + my $self = shift; + + my $use_list = { 'IO::Uncompress::Bunzip2' => '0.0' }; + unless( can_load( modules => $use_list ) ) { + $self->_error(loc("You do not have '%1' installed - Please " . 
+ "install it as soon as possible.", + 'IO::Uncompress::Bunzip2')); + return METHOD_NA; + } + + IO::Uncompress::Bunzip2::bunzip2($self->archive => $self->_gunzip_to) + or return $self->_error(loc("Unable to uncompress '%1': %2", + $self->archive, + $IO::Uncompress::Bunzip2::Bunzip2Error)); + + ### set what files where extract, and where they went ### + $self->files( [$self->_gunzip_to] ); + $self->extract_path( File::Spec->rel2abs(cwd()) ); + + return 1; +} + +################################# +# +# UnXz code +# +################################# + +sub _unxz_bin { + my $self = shift; + + ### check for /bin/unxz -- we need it ### + unless( $self->bin_unxz ) { + $self->_error(loc("No '%1' program found", '/bin/unxz')); + return METHOD_NA; + } + + my $fh = FileHandle->new('>'. $self->_gunzip_to) or + return $self->_error(loc("Could not open '%1' for writing: %2", + $self->_gunzip_to, $! )); + + my $cmd = [ $self->bin_unxz, '-c', '-d', '-f', $self->archive ]; + + my $buffer; + unless( scalar run( command => $cmd, + verbose => $DEBUG, + buffer => \$buffer ) + ) { + return $self->_error(loc("Unable to unxz '%1': %2", + $self->archive, $buffer)); + } + + ### no buffers available? + if( !IPC::Cmd->can_capture_buffer and !$buffer ) { + $self->_error( $self->_no_buffer_content( $self->archive ) ); + } + + $self->_print($fh, $buffer) if defined $buffer; + + close $fh; + + ### set what files where extract, and where they went ### + $self->files( [$self->_gunzip_to] ); + $self->extract_path( File::Spec->rel2abs(cwd()) ); + + return 1; +} + +sub _unxz_cz { + my $self = shift; + + my $use_list = { 'IO::Uncompress::UnXz' => '0.0' }; + unless( can_load( modules => $use_list ) ) { + $self->_error(loc("You do not have '%1' installed - Please " . + "install it as soon as possible.", + 'IO::Uncompress::UnXz')); + return METHOD_NA; + } + + IO::Uncompress::UnXz::unxz($self->archive => $self->_gunzip_to) + or return $self->_error(loc("Unable to uncompress '%1': %2", + $self->archive, + $IO::Uncompress::UnXz::UnXzError)); + + ### set what files where extract, and where they went ### + $self->files( [$self->_gunzip_to] ); + $self->extract_path( File::Spec->rel2abs(cwd()) ); + + return 1; +} + + +################################# +# +# unlzma code +# +################################# + +sub _unlzma_bin { + my $self = shift; + + ### check for /bin/unlzma -- we need it ### + unless( $self->bin_unlzma ) { + $self->_error(loc("No '%1' program found", '/bin/unlzma')); + return METHOD_NA; + } + + my $fh = FileHandle->new('>'. $self->_gunzip_to) or + return $self->_error(loc("Could not open '%1' for writing: %2", + $self->_gunzip_to, $! )); + + my $cmd = [ $self->bin_unlzma, '-c', $self->archive ]; + + my $buffer; + unless( scalar run( command => $cmd, + verbose => $DEBUG, + buffer => \$buffer ) + ) { + return $self->_error(loc("Unable to unlzma '%1': %2", + $self->archive, $buffer)); + } + + ### no buffers available? 
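+    ### (note that the whole decompressed stream was captured into
+    ### $buffer above and is written out below; for very large
+    ### archives this can be memory-hungry -- q.v. the POD section
+    ### on supporting very large files.)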
+ if( !IPC::Cmd->can_capture_buffer and !$buffer ) { + $self->_error( $self->_no_buffer_content( $self->archive ) ); + } + + $self->_print($fh, $buffer) if defined $buffer; + + close $fh; + + ### set what files where extract, and where they went ### + $self->files( [$self->_gunzip_to] ); + $self->extract_path( File::Spec->rel2abs(cwd()) ); + + return 1; +} + +sub _unlzma_cz { + my $self = shift; + + my $use_list1 = { 'IO::Uncompress::UnLzma' => '0.0' }; + my $use_list2 = { 'Compress::unLZMA' => '0.0' }; + + if (can_load( modules => $use_list1 ) ) { + IO::Uncompress::UnLzma::unlzma($self->archive => $self->_gunzip_to) + or return $self->_error(loc("Unable to uncompress '%1': %2", + $self->archive, + $IO::Uncompress::UnLzma::UnLzmaError)); + } + elsif (can_load( modules => $use_list2 ) ) { + + my $fh = FileHandle->new('>'. $self->_gunzip_to) or + return $self->_error(loc("Could not open '%1' for writing: %2", + $self->_gunzip_to, $! )); + + my $buffer; + $buffer = Compress::unLZMA::uncompressfile( $self->archive ); + unless ( defined $buffer ) { + return $self->_error(loc("Could not unlzma '%1': %2", + $self->archive, $@)); + } + + $self->_print($fh, $buffer) if defined $buffer; + + close $fh; + } + else { + $self->_error(loc("You do not have '%1' or '%2' installed - Please " . + "install it as soon as possible.", 'Compress::unLZMA', 'IO::Uncompress::UnLzma')); + return METHOD_NA; + } + + ### set what files where extract, and where they went ### + $self->files( [$self->_gunzip_to] ); + $self->extract_path( File::Spec->rel2abs(cwd()) ); + + return 1; +} + +##################################### +# +# unzip heuristics for FreeBSD-alikes +# +##################################### + +sub _is_infozip_esque { + my $unzip = shift; + + my @strings; + my $buf = ''; + + { + open my $file, '<', $unzip or die "$!\n"; + binmode $file; + local $/ = \1; + local $_; + while(<$file>) { + if ( m![[:print:]]! ) { + $buf .= $_; + next; + } + if ( $buf and m![^[:print:]]! ) { + push @strings, $buf if length $buf >= 4; + $buf = ''; + next; + } + } + } + push @strings, $buf if $buf; + foreach my $part ( @strings ) { + if ( $part =~ m!ZIPINFO! or $part =~ m!usage:.+?Z1! ) { + return $unzip; + } + } + return; +} + +################################# +# +# Error code +# +################################# + +# For printing binaries that avoids interfering globals +sub _print { + my $self = shift; + my $fh = shift; + + local( $\, $", $, ) = ( undef, ' ', '' ); + return print $fh @_; +} + +sub _error { + my $self = shift; + my $error = shift; + my $lerror = Carp::longmess($error); + + push @{$self->_error_msg}, $error; + push @{$self->_error_msg_long}, $lerror; + + ### set $Archive::Extract::WARN to 0 to disable printing + ### of errors + if( $WARN ) { + carp $DEBUG ? $lerror : $error; + } + + return; +} + +sub error { + my $self = shift; + + ### make sure we have a fallback aref + my $aref = do { + shift() + ? $self->_error_msg_long + : $self->_error_msg + } || []; + + return join $/, @$aref; +} + +=head2 debug( MESSAGE ) + +This method outputs MESSAGE to the default filehandle if C<$DEBUG> is +true. It's a small method, but it's here if you'd like to subclass it +so you can so something else with any debugging output. + +=cut + +### this is really a stub for subclassing +sub debug { + return unless $DEBUG; + + print $_[1]; +} + +sub _no_buffer_files { + my $self = shift; + my $file = shift or return; + return loc("No buffer captured, unable to tell ". 
+            "extracted files or extraction dir for '%1'", $file);
+}
+
+sub _no_buffer_content {
+    my $self = shift;
+    my $file = shift or return;
+    return loc("No buffer captured, unable to get content for '%1'", $file);
+}
+1;
+
+=pod
+
+=head1 UTILITY FUNCTION
+
+=head2 type_for($archive)
+
+Given an archive file name, it determines the type by parsing the file
+name extension. Used by C<new()> when the C<type> parameter is not passed.
+Also useful when the archive file does not include a suffix but the file
+name is otherwise known, such as when a file is uploaded to a web server
+and stored with a temporary name that differs from the original name, and
+you want to use the same detection pattern as Archive::Extract. Example:
+
+    my $ae = Archive::Extract->new(
+        archive => '/tmp/02af6s',
+        type    => Archive::Extract::type_for('archive.zip'),
+    );
+
+=head1 HOW IT WORKS
+
+C<Archive::Extract> tries first to determine what type of archive you
+are passing it, by inspecting its suffix. It does not do this by using
+Mime magic, or something related. See C<TODO / CAVEATS> below.
+
+Once it has determined the file type, it knows which extraction methods
+it can use on the archive. It will try a perl solution first, then fall
+back to a commandline tool if that fails. If that also fails, it will
+return false, indicating it was unable to extract the archive.
+See the section on C<GLOBAL VARIABLES> to see how to alter this order.
+
+=head1 CAVEATS
+
+=head2 File Extensions
+
+C<Archive::Extract> relies on the extension of the archive to determine
+what type it is, and what extractor methods therefore can be used. If
+your archives do not have any of the extensions as described in the
+C<new()> method, you will have to specify the type explicitly, or
+C<Archive::Extract> will not be able to extract the archive for you.
+
+=head2 Supporting Very Large Files
+
+C<Archive::Extract> can use either pure perl modules or command line
+programs under the hood. Some of the pure perl modules (like
+C<Archive::Tar> and C<Compress::unLZMA>) take the entire contents of the
+archive into memory, which may not be feasible on your system. Consider
+setting the global variable C<$Archive::Extract::PREFER_BIN> to C<1>,
+which will prefer the use of command line programs and won't consume so
+much memory.
+
+See the C<GLOBAL VARIABLES> section below for details.
+
+=head2 Bunzip2 support of arbitrary extensions.
+
+Older versions of C</bin/bunzip2> do not support arbitrary file
+extensions and insist on a C<.bz2> suffix. Although we do our best
+to guard against this, if you experience a bunzip2 error, it may
+be related to this. For details, please see the C<have_old_bunzip2>
+method.
+
+=head1 GLOBAL VARIABLES
+
+=head2 $Archive::Extract::DEBUG
+
+Set this variable to C<true> to have all calls to command line tools
+be printed out, including all their output.
+This also enables C<Carp::longmess> errors, instead of the regular
+C<carp> errors.
+
+Good for tracking down why things don't work with your particular
+setup.
+
+Defaults to C<false>.
+
+=head2 $Archive::Extract::WARN
+
+This variable controls whether errors encountered internally by
+C<Archive::Extract> should be C<carp>'d or not.
+
+Set to false to silence warnings. Inspect the output of the C<error()>
+method manually to see what went wrong.
+
+Defaults to C<true>.
+
+=head2 $Archive::Extract::PREFER_BIN
+
+This variable controls whether C<Archive::Extract> should prefer the
+use of perl modules, or commandline tools to extract archives.
+
+Set to C<true> to have C<Archive::Extract> prefer commandline tools.
+
+Defaults to C<false>.
+
+=head1 TODO / CAVEATS
+
+=over 4
+
+=item Mime magic support
+
+Maybe this module should use something like C<File::Type> to determine
+the type, rather than blindly trust the suffix.
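+
+A rough sketch of what that could look like (C<File::Type> is a CPAN
+module that inspects file contents; it is B<not> currently used by this
+code, so this is illustrative only):
+
+    use File::Type;
+    my $mime = File::Type->new->checktype_filename( $archive );
+    # e.g. 'application/zip' -- this would still need to be mapped
+    # back to one of the types listed in new()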
+
+=item Thread safety
+
+Currently, C<Archive::Extract> does a C<chdir> to the extraction dir before
+extraction, and a C<chdir> back again after. This is not necessarily
+thread safe. See C<rt.cpan.org> bug C<#45671> for details.
+
+=back
+
+=head1 BUG REPORTS
+
+Please report bugs or other issues to E<lt>bug-archive-extract@rt.cpan.orgE<gt>.
+
+=head1 AUTHOR
+
+This module by Jos Boumans E<lt>kane@cpan.orgE<gt>.
+
+=head1 COPYRIGHT
+
+This library is free software; you may redistribute and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+# Local variables:
+# c-indentation-style: bsd
+# c-basic-offset: 4
+# indent-tabs-mode: nil
+# End:
+# vim: expandtab shiftwidth=4:
+
diff --git a/src/modules/DBM/Deep.pm b/src/modules/DBM/Deep.pm
new file mode 100644
index 0000000..966b360
--- /dev/null
+++ b/src/modules/DBM/Deep.pm
@@ -0,0 +1,688 @@
+package DBM::Deep;
+
+use 5.008_004;
+
+use strict;
+use warnings FATAL => 'all';
+no warnings 'recursion';
+
+our $VERSION = q(2.0013);
+
+use Scalar::Util ();
+
+use overload
+    (
+        '""' =>
+        '0+' => sub { $_[0] },
+    )[0,2,1,2], # same sub for both
+    fallback => 1;
+
+use constant DEBUG => 0;
+
+use DBM::Deep::Engine;
+
+sub TYPE_HASH   () { DBM::Deep::Engine->SIG_HASH  }
+sub TYPE_ARRAY  () { DBM::Deep::Engine->SIG_ARRAY }
+
+my %obj_cache; # In external_refs mode, all objects are registered here,
+               # and dealt with in the END block at the bottom.
+use constant HAVE_HUFH => scalar eval{ require Hash::Util::FieldHash };
+HAVE_HUFH and Hash::Util::FieldHash::fieldhash(%obj_cache);
+
+# This is used in all the children of this class in their TIE methods.
+sub _get_args {
+    my $proto = shift;
+
+    my $args;
+    if (scalar(@_) > 1) {
+        if ( @_ % 2 ) {
+            $proto->_throw_error( "Odd number of parameters to " . (caller(1))[2] );
+        }
+        $args = {@_};
+    }
+    elsif ( ref $_[0] ) {
+        unless ( eval { local $SIG{'__DIE__'}; %{$_[0]} || 1 } ) {
+            $proto->_throw_error( "Not a hashref in args to " . (caller(1))[2] );
+        }
+        $args = $_[0];
+    }
+    else {
+        $args = { file => shift };
+    }
+
+    return $args;
+}
+
+# Class constructor method for Perl OO interface.
+# Calls tie() and returns blessed reference to tied hash or array,
+# providing a hybrid OO/tie interface.
+sub new {
+    my $class = shift;
+    my $args = $class->_get_args( @_ );
+    my $self;
+
+    if (defined($args->{type}) && $args->{type} eq TYPE_ARRAY) {
+        $class = 'DBM::Deep::Array';
+        require DBM::Deep::Array;
+        tie @$self, $class, %$args;
+    }
+    else {
+        $class = 'DBM::Deep::Hash';
+        require DBM::Deep::Hash;
+        tie %$self, $class, %$args;
+    }
+
+    return bless $self, $class;
+}
+
+# This initializer is called from the various TIE* methods. new() calls tie(),
+# which allows for a single point of entry.
+sub _init {
+    my $class = shift;
+    my ($args) = @_;
+
+    # locking implicitly enables autoflush
+    if ($args->{locking}) { $args->{autoflush} = 1; }
+
+    # These are the defaults to be optionally overridden below
+    my $self = bless {
+        type        => TYPE_HASH,
+        base_offset => undef,
+        staleness   => undef,
+        engine      => undef,
+    }, $class;
+
+    unless ( exists $args->{engine} ) {
+        my $class =
+            exists $args->{dbi}   ? 'DBM::Deep::Engine::DBI'  :
+            exists $args->{_test} ?
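+            # (a 'dbi' key in the constructor args selects the DBI
+            # engine, '_test' a test-only engine; anything else falls
+            # through to the default File engine below)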
'DBM::Deep::Engine::Test' : + 'DBM::Deep::Engine::File' ; + + eval "use $class"; die $@ if $@; + $args->{engine} = $class->new({ + %{$args}, + obj => $self, + }); + } + + # Grab the parameters we want to use + foreach my $param ( keys %$self ) { + next unless exists $args->{$param}; + $self->{$param} = $args->{$param}; + } + + eval { + local $SIG{'__DIE__'}; + + $self->lock_exclusive; + $self->_engine->setup( $self ); + $self->unlock; + }; if ( $@ ) { + my $e = $@; + eval { local $SIG{'__DIE__'}; $self->unlock; }; + die $e; + } + + if( $self->{engine}->{external_refs} + and my $sector = $self->{engine}->load_sector( $self->{base_offset} ) + ) { + $sector->increment_refcount; + + Scalar::Util::weaken( my $feeble_ref = $self ); + $obj_cache{ $self } = \$feeble_ref; + + # Make sure this cache is not a memory hog + if(!HAVE_HUFH) { + for(keys %obj_cache) { + delete $obj_cache{$_} if not ${$obj_cache{$_}}; + } + } + } + + return $self; +} + +sub TIEHASH { + shift; + require DBM::Deep::Hash; + return DBM::Deep::Hash->TIEHASH( @_ ); +} + +sub TIEARRAY { + shift; + require DBM::Deep::Array; + return DBM::Deep::Array->TIEARRAY( @_ ); +} + +sub lock_exclusive { + my $self = shift->_get_self; + return $self->_engine->lock_exclusive( $self, @_ ); +} +*lock = \&lock_exclusive; + +sub lock_shared { + my $self = shift->_get_self; + # cluck() the problem with cached File objects. + unless ( $self->_engine ) { + require Carp; + require Data::Dumper; + Carp::cluck( Data::Dumper->Dump( [$self], ['self'] ) ); + } + return $self->_engine->lock_shared( $self, @_ ); +} + +sub unlock { + my $self = shift->_get_self; + return $self->_engine->unlock( $self, @_ ); +} + +sub _copy_value { + my $self = shift->_get_self; + my ($spot, $value) = @_; + + if ( !ref $value ) { + ${$spot} = $value; + } + else { + my $r = Scalar::Util::reftype( $value ); + my $tied; + if ( $r eq 'ARRAY' ) { + $tied = tied(@$value); + } + elsif ( $r eq 'HASH' ) { + $tied = tied(%$value); + } + else { + __PACKAGE__->_throw_error( "Unknown type for '$value'" ); + } + + if ( eval { local $SIG{'__DIE__'}; $tied->isa( __PACKAGE__ ) } ) { + ${$spot} = $tied->_repr; + $tied->_copy_node( ${$spot} ); + } + else { + if ( $r eq 'ARRAY' ) { + ${$spot} = [ @{$value} ]; + } + else { + ${$spot} = { %{$value} }; + } + } + + my $c = Scalar::Util::blessed( $value ); + if ( defined $c && !$c->isa( __PACKAGE__ ) ) { + ${$spot} = bless ${$spot}, $c + } + } + + return 1; +} + +sub export { + my $self = shift->_get_self; + + my $temp = $self->_repr; + + $self->lock_exclusive; + $self->_copy_node( $temp ); + $self->unlock; + + my $classname = $self->_engine->get_classname( $self ); + if ( defined $classname ) { + bless $temp, $classname; + } + + return $temp; +} + +sub _check_legality { + my $self = shift; + my ($val) = @_; + + my $r = Scalar::Util::reftype( $val ); + + return $r if !defined $r || '' eq $r; + return $r if 'HASH' eq $r; + return $r if 'ARRAY' eq $r; + + __PACKAGE__->_throw_error( + "Storage of references of type '$r' is not supported." + ); +} + +sub import { + return if !ref $_[0]; # Perl calls import() on use -- ignore + + my $self = shift->_get_self; + my ($struct) = @_; + + my $type = $self->_check_legality( $struct ); + if ( !$type ) { + __PACKAGE__->_throw_error( "Cannot import a scalar" ); + } + + if ( substr( $type, 0, 1 ) ne $self->_type ) { + __PACKAGE__->_throw_error( + "Cannot import " . ('HASH' eq $type ? 'a hash' : 'an array') + . " into " . ('HASH' eq $type ? 
'an array' : 'a hash') + ); + } + + my %seen; + my $recurse; + $recurse = sub { + my ($db, $val) = @_; + + my $obj = 'HASH' eq Scalar::Util::reftype( $db ) ? tied(%$db) : tied(@$db); + $obj ||= $db; + + my $r = $self->_check_legality( $val ); + if ( 'HASH' eq $r ) { + while ( my ($k, $v) = each %$val ) { + my $r = $self->_check_legality( $v ); + if ( $r ) { + my $temp = 'HASH' eq $r ? {} : []; + if ( my $c = Scalar::Util::blessed( $v ) ) { + bless $temp, $c; + } + $obj->put( $k, $temp ); + $recurse->( $temp, $v ); + } + else { + $obj->put( $k, $v ); + } + } + } + elsif ( 'ARRAY' eq $r ) { + foreach my $k ( 0 .. $#$val ) { + my $v = $val->[$k]; + my $r = $self->_check_legality( $v ); + if ( $r ) { + my $temp = 'HASH' eq $r ? {} : []; + if ( my $c = Scalar::Util::blessed( $v ) ) { + bless $temp, $c; + } + $obj->put( $k, $temp ); + $recurse->( $temp, $v ); + } + else { + $obj->put( $k, $v ); + } + } + } + }; + $recurse->( $self, $struct ); + + return 1; +} + +#XXX Need to keep track of who has a fh to this file in order to +#XXX close them all prior to optimize on Win32/cygwin +# Rebuild entire database into new file, then move +# it back on top of original. +sub optimize { + my $self = shift->_get_self; + + # Optimizing is only something we need to do when we're working with our + # own file format. Otherwise, let the other guy do the optimizations. + return unless $self->_engine->isa( 'DBM::Deep::Engine::File' ); + +#XXX Need to create a new test for this +# if ($self->_engine->storage->{links} > 1) { +# $self->_throw_error("Cannot optimize: reference count is greater than 1"); +# } + + #XXX Do we have to lock the tempfile? + + #XXX Should we use tempfile() here instead of a hard-coded name? + my $temp_filename = $self->_engine->storage->{file} . '.tmp'; + my $db_temp = __PACKAGE__->new( + file => $temp_filename, + type => $self->_type, + + # Bring over all the parameters that we need to bring over + ( map { $_ => $self->_engine->$_ } qw( + byte_size max_buckets data_sector_size num_txns + )), + ); + + $self->lock_exclusive; + $self->_engine->clear_cache; + $self->_copy_node( $db_temp ); + $self->unlock; + $db_temp->_engine->storage->close; + undef $db_temp; + + ## + # Attempt to copy user, group and permissions over to new file + ## + $self->_engine->storage->copy_stats( $temp_filename ); + + # q.v. perlport for more information on this variable + if ( $^O eq 'MSWin32' || $^O eq 'cygwin' ) { + ## + # Potential race condition when optimizing on Win32 with locking. + # The Windows filesystem requires that the filehandle be closed + # before it is overwritten with rename(). This could be redone + # with a soft copy. 
+ ## + $self->unlock; + $self->_engine->storage->close; + } + + if (!rename $temp_filename, $self->_engine->storage->{file}) { + unlink $temp_filename; + $self->unlock; + $self->_throw_error("Optimize failed: Cannot copy temp file over original: $!"); + } + + $self->unlock; + $self->_engine->storage->close; + + $self->_engine->storage->open; + $self->lock_exclusive; + $self->_engine->setup( $self ); + $self->unlock; + + return 1; +} + +sub clone { + my $self = shift->_get_self; + + return __PACKAGE__->new( + type => $self->_type, + base_offset => $self->_base_offset, + staleness => $self->_staleness, + engine => $self->_engine, + ); +} + +sub supports { + my $self = shift->_get_self; + return $self->_engine->supports( @_ ); +} + +sub db_version { + shift->_get_self->_engine->db_version; +} + +#XXX Migrate this to the engine, where it really belongs and go through some +# API - stop poking in the innards of someone else.. +{ + my %is_legal_filter = map { + $_ => ~~1, + } qw( + store_key store_value + fetch_key fetch_value + ); + + sub set_filter { + my $self = shift->_get_self; + my $type = lc shift; + my $func = shift; + + if ( $is_legal_filter{$type} ) { + $self->_engine->storage->{"filter_$type"} = $func; + return 1; + } + + return; + } + + sub filter_store_key { $_[0]->set_filter( store_key => $_[1] ); } + sub filter_store_value { $_[0]->set_filter( store_value => $_[1] ); } + sub filter_fetch_key { $_[0]->set_filter( fetch_key => $_[1] ); } + sub filter_fetch_value { $_[0]->set_filter( fetch_value => $_[1] ); } +} + +sub begin_work { + my $self = shift->_get_self; + $self->lock_exclusive; + my $rv = eval { + local $SIG{'__DIE__'}; + $self->_engine->begin_work( $self, @_ ); + }; + my $e = $@; + $self->unlock; + die $e if $e; + return $rv; +} + +sub rollback { + my $self = shift->_get_self; + + $self->lock_exclusive; + my $rv = eval { + local $SIG{'__DIE__'}; + $self->_engine->rollback( $self, @_ ); + }; + my $e = $@; + $self->unlock; + die $e if $e; + return $rv; +} + +sub commit { + my $self = shift->_get_self; + $self->lock_exclusive; + my $rv = eval { + local $SIG{'__DIE__'}; + $self->_engine->commit( $self, @_ ); + }; + my $e = $@; + $self->unlock; + die $e if $e; + return $rv; +} + +# Accessor methods +sub _engine { + my $self = $_[0]->_get_self; + return $self->{engine}; +} + +sub _type { + my $self = $_[0]->_get_self; + return $self->{type}; +} + +sub _base_offset { + my $self = $_[0]->_get_self; + return $self->{base_offset}; +} + +sub _staleness { + my $self = $_[0]->_get_self; + return $self->{staleness}; +} + +# Utility methods +sub _throw_error { + my $n = 0; + while( 1 ) { + my @caller = caller( ++$n ); + next if $caller[0] =~ m/^DBM::Deep/; + + die "DBM::Deep: $_[1] at $caller[1] line $caller[2]\n"; + } +} + +# Store single hash key/value or array element in database. +sub STORE { + my $self = shift->_get_self; + my ($key, $value) = @_; + warn "STORE($self, '$key', '@{[defined$value?$value:'undef']}')\n" if DEBUG; + + unless ( $self->_engine->storage->is_writable ) { + $self->_throw_error( 'Cannot write to a readonly filehandle' ); + } + + $self->lock_exclusive; + + # User may be storing a complex value, in which case we do not want it run + # through the filtering system. 
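+    # A sketch of such a filter pair (encode()/decode() here are
+    # hypothetical serialisation helpers, not part of DBM::Deep):
+    #   $db->filter_store_value( sub { encode( $_[0] ) } );
+    #   $db->filter_fetch_value( sub { decode( $_[0] ) } );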
+ if ( !ref($value) && $self->_engine->storage->{filter_store_value} ) { + $value = $self->_engine->storage->{filter_store_value}->( $value ); + } + + eval { + local $SIG{'__DIE__'}; + $self->_engine->write_value( $self, $key, $value ); + }; if ( my $e = $@ ) { + $self->unlock; + die $e; + } + + $self->unlock; + + return 1; +} + +# Fetch single value or element given plain key or array index +sub FETCH { + my $self = shift->_get_self; + my ($key) = @_; + warn "FETCH($self, '$key')\n" if DEBUG; + + $self->lock_shared; + + my $result = $self->_engine->read_value( $self, $key ); + + $self->unlock; + + # Filters only apply to scalar values, so the ref check is making + # sure the fetched bucket is a scalar, not a child hash or array. + return ($result && !ref($result) && $self->_engine->storage->{filter_fetch_value}) + ? $self->_engine->storage->{filter_fetch_value}->($result) + : $result; +} + +# Delete single key/value pair or element given plain key or array index +sub DELETE { + my $self = shift->_get_self; + my ($key) = @_; + warn "DELETE($self, '$key')\n" if DEBUG; + + unless ( $self->_engine->storage->is_writable ) { + $self->_throw_error( 'Cannot write to a readonly filehandle' ); + } + + $self->lock_exclusive; + + ## + # Delete bucket + ## + my $value = $self->_engine->delete_key( $self, $key); + + if (defined $value && !ref($value) && $self->_engine->storage->{filter_fetch_value}) { + $value = $self->_engine->storage->{filter_fetch_value}->($value); + } + + $self->unlock; + + return $value; +} + +# Check if a single key or element exists given plain key or array index +sub EXISTS { + my $self = shift->_get_self; + my ($key) = @_; + warn "EXISTS($self, '$key')\n" if DEBUG; + + $self->lock_shared; + + my $result = $self->_engine->key_exists( $self, $key ); + + $self->unlock; + + return $result; +} + +# Clear all keys from hash, or all elements from array. +sub CLEAR { + my $self = shift->_get_self; + warn "CLEAR($self)\n" if DEBUG; + + my $engine = $self->_engine; + unless ( $engine->storage->is_writable ) { + $self->_throw_error( 'Cannot write to a readonly filehandle' ); + } + + $self->lock_exclusive; + eval { + local $SIG{'__DIE__'}; + $engine->clear( $self ); + }; + my $e = $@; + warn "$e\n" if $e && DEBUG; + + $self->unlock; + + die $e if $e; + + return 1; +} + +# Public method aliases +sub put { (shift)->STORE( @_ ) } +sub get { (shift)->FETCH( @_ ) } +sub store { (shift)->STORE( @_ ) } +sub fetch { (shift)->FETCH( @_ ) } +sub delete { (shift)->DELETE( @_ ) } +sub exists { (shift)->EXISTS( @_ ) } +sub clear { (shift)->CLEAR( @_ ) } + +sub _dump_file {shift->_get_self->_engine->_dump_file;} + +sub _warnif { + # There is, unfortunately, no way to avoid this hack. warnings.pm does not + # allow us to specify exactly the call frame we want. So, for now, we just + # look at the bitmask ourselves. + my $level; + { + my($pack, $file, $line, $bitmask) = (caller $level++)[0..2,9]; + redo if $pack =~ /^DBM::Deep(?:::|\z)/; + if( vec $bitmask, $warnings'Offsets{$_[0]}, 1, + || vec $bitmask, $warnings'Offsets{all}, 1, + ) { + my $msg = $_[1] =~ /\n\z/ ? 
$_[1] : "$_[1] at $file line $line.\n";
+            die $msg
+                if vec $bitmask, $warnings'Offsets{$_[0]}+1, 1,
+                || vec $bitmask, $warnings'Offsets{all}+1, 1;
+            warn $msg;
+        }
+    }
+}
+
+sub _free {
+    my $self = shift;
+    if(my $sector = $self->{engine}->load_sector( $self->{base_offset} )) {
+        $sector->free;
+    }
+}
+
+sub DESTROY {
+    my $self = shift;
+    my $alter_ego = $self->_get_self;
+    if( !$alter_ego || $self != $alter_ego ) {
+        return; # Don't run the destructor twice! (What follows only applies to
+    }           # the inner object, not the tie.)
+
+    # If the engine is gone, the END block has beaten us to it.
+    return if !$self->{engine};
+    if( $self->{engine}->{external_refs} ) {
+        $self->_free;
+    }
+}
+
+# Relying on the destructor alone is problematic, as the order in which
+# objects are discarded is random in global destruction. So we do the
+# clean-up here, preemptively, before global destruction.
+END {
+    defined $$_ and $$_->_free, delete $$_->{engine}
+        for(values %obj_cache);
+}
+
+1;
+__END__
diff --git a/src/modules/DBM/Deep.pod b/src/modules/DBM/Deep.pod
new file mode 100644
index 0000000..cd456de
--- /dev/null
+++ b/src/modules/DBM/Deep.pod
@@ -0,0 +1,1330 @@
+=head1 NAME
+
+DBM::Deep - A pure perl multi-level hash/array DBM that supports transactions
+
+=head1 VERSION
+
+2.0013
+
+=head1 SYNOPSIS
+
+  use DBM::Deep;
+  my $db = DBM::Deep->new( "foo.db" );
+
+  $db->{key} = 'value';
+  print $db->{key};
+
+  $db->put('key' => 'value');
+  print $db->get('key');
+
+  # true multi-level support
+  $db->{my_complex} = [
+      'hello', { perl => 'rules' },
+      42, 99,
+  ];
+
+  $db->begin_work;
+
+  # Do stuff here
+
+  $db->rollback;
+  $db->commit;
+
+  tie my %db, 'DBM::Deep', 'foo.db';
+  $db{key} = 'value';
+  print $db{key};
+
+  tied(%db)->put('key' => 'value');
+  print tied(%db)->get('key');
+
+=head1 DESCRIPTION
+
+A unique flat-file database module, written in pure perl. True multi-level
+hash/array support (unlike MLDBM, which is faked), hybrid OO / tie()
+interface, cross-platform FTPable files, ACID transactions, and is quite fast.
+Can handle millions of keys and unlimited levels without significant
+slow-down. Written from the ground-up in pure perl -- this is NOT a wrapper
+around a C-based DBM. Out-of-the-box compatibility with Unix, Mac OS X and
+Windows.
+
+=head1 VERSION DIFFERENCES
+
+B<NOTE>: 2.0000 introduces Unicode support in the File back end. This
+necessitates a change in the file format. The version 1.0003 format is
+still supported, though, so we have added a L</db_version> method. If you
+are using a database in the old format, you will have to upgrade it to get
+Unicode support.
+
+B<NOTE>: 1.0020 introduces different engines which are backed by different
+types of storage. There is the original storage (called 'File') and a
+database storage (called 'DBI'). q.v. L</PLUGINS> for more information.
+
+B<NOTE>: 1.0000 has significant file format differences from prior versions.
+There is a backwards-compatibility layer at C<utils/upgrade_db.pl>. Files
+created by 1.0000 or higher are B<NOT> compatible with scripts using prior
+versions.
+
+=head1 PLUGINS
+
+DBM::Deep is a wrapper around different storage engines. These are:
+
+=head2 File
+
+This is the traditional storage engine, storing the data to a custom file
+format. The parameters accepted are:
+
+=over 4
+
+=item * file
+
+Filename of the DB file to link the handle to. You can pass a full absolute
+filesystem path, partial path, or a plain filename if the file is in the
+current working directory. This is a required parameter (though q.v. C<fh>).
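+
+For example, both of these open the same File-backed database:
+
+    my $db  = DBM::Deep->new( "foo.db" );          # bare filename
+    my $db2 = DBM::Deep->new( file => "foo.db" );  # named parameter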
+ +=item * fh + +If you want, you can pass in the fh instead of the file. This is most useful for +doing something like: + + my $db = DBM::Deep->new( { fh => \*DATA } ); + +You are responsible for making sure that the fh has been opened appropriately +for your needs. If you open it read-only and attempt to write, an exception will +be thrown. If you open it write-only or append-only, an exception will be thrown +immediately as DBM::Deep needs to read from the fh. + +=item * file_offset + +This is the offset within the file that the DBM::Deep db starts. Most of the +time, you will not need to set this. However, it's there if you want it. + +If you pass in fh and do not set this, it will be set appropriately. + +=item * locking + +Specifies whether locking is to be enabled. DBM::Deep uses Perl's flock() +function to lock the database in exclusive mode for writes, and shared mode +for reads. Pass any true value to enable. This affects the base DB handle +I that use the same DB file. This is an +optional parameter, and defaults to 1 (enabled). See L below for +more. + +=back + +When you open an existing database file, the version of the database format +will stay the same. But if you are creating a new file, it will be in the +latest format. + +=head2 DBI + +This is a storage engine that stores the data in a relational database. Funnily +enough, this engine doesn't work with transactions (yet) as InnoDB doesn't do +what DBM::Deep needs it to do. + +The parameters accepted are: + +=over 4 + +=item * dbh + +This is a DBH that's already been opened with L. + +=item * dbi + +This is a hashref containing: + +=over 4 + +=item * dsn + +=item * username + +=item * password + +=item * connect_args + +=back + +These correspond to the 4 parameters L takes. + +=back + +B: This has only been tested with MySQL and SQLite (with +disappointing results). I plan on extending this to work with PostgreSQL in +the near future. Oracle, Sybase, and other engines will come later. + +=head2 Planned engines + +There are plans to extend this functionality to (at least) the following: + +=over 4 + +=item * BDB (and other hash engines like memcached) + +=item * NoSQL engines (such as Tokyo Cabinet) + +=item * DBIx::Class (and other ORMs) + +=back + +=head1 SETUP + +Construction can be done OO-style (which is the recommended way), or using +Perl's tie() function. Both are examined here. + +=head2 OO Construction + +The recommended way to construct a DBM::Deep object is to use the new() +method, which gets you a blessed I tied hash (or array) reference. + + my $db = DBM::Deep->new( "foo.db" ); + +This opens a new database handle, mapped to the file "foo.db". If this +file does not exist, it will automatically be created. DB files are +opened in "r+" (read/write) mode, and the type of object returned is a +hash, unless otherwise specified (see L below). + +You can pass a number of options to the constructor to specify things like +locking, autoflush, etc. This is done by passing an inline hash (or hashref): + + my $db = DBM::Deep->new( + file => "foo.db", + locking => 1, + autoflush => 1 + ); + +Notice that the filename is now specified I the hash with +the "file" parameter, as opposed to being the sole argument to the +constructor. This is required if any options are specified. +See L below for the complete list. + +You can also start with an array instead of a hash. 
For this, you must +specify the C parameter: + + my $db = DBM::Deep->new( + file => "foo.db", + type => DBM::Deep->TYPE_ARRAY + ); + +B Specifying the C parameter only takes effect when beginning +a new DB file. If you create a DBM::Deep object with an existing file, the +C will be loaded from the file header, and an error will be thrown if +the wrong type is passed in. + +=head2 Tie Construction + +Alternately, you can create a DBM::Deep handle by using Perl's built-in +tie() function. The object returned from tie() can be used to call methods, +such as lock() and unlock(). (That object can be retrieved from the tied +variable at any time using tied() - please see L for more info.) + + my %hash; + my $db = tie %hash, "DBM::Deep", "foo.db"; + + my @array; + my $db = tie @array, "DBM::Deep", "bar.db"; + +As with the OO constructor, you can replace the DB filename parameter with +a hash containing one or more options (see L just below for the +complete list). + + tie %hash, "DBM::Deep", { + file => "foo.db", + locking => 1, + autoflush => 1 + }; + +=head2 Options + +There are a number of options that can be passed in when constructing your +DBM::Deep objects. These apply to both the OO- and tie- based approaches. + +=over + +=item * type + +This parameter specifies what type of object to create, a hash or array. Use +one of these two constants: + +=over 4 + +=item * C<< DBM::Deep->TYPE_HASH >> + +=item * C<< DBM::Deep->TYPE_ARRAY >> + +=back + +This only takes effect when beginning a new file. This is an optional +parameter, and defaults to C<< DBM::Deep->TYPE_HASH >>. + +=item * autoflush + +Specifies whether autoflush is to be enabled on the underlying filehandle. +This obviously slows down write operations, but is required if you may have +multiple processes accessing the same DB file (also consider enable I). +Pass any true value to enable. This is an optional parameter, and defaults to 1 +(enabled). + +=item * filter_* + +See L below. + +=back + +The following parameters may be specified in the constructor the first time the +datafile is created. However, they will be stored in the header of the file and +cannot be overridden by subsequent openings of the file - the values will be set +from the values stored in the datafile's header. + +=over 4 + +=item * num_txns + +This is the number of transactions that can be running at one time. The +default is one - the HEAD. The minimum is one and the maximum is 255. The more +transactions, the larger and quicker the datafile grows. + +Simple access to a database, regardless of how many processes are doing it, +already counts as one transaction (the HEAD). So, if you want, say, 5 +processes to be able to call begin_work at the same time, C must +be at least 6. + +See L below. + +=item * max_buckets + +This is the number of entries that can be added before a reindexing. The larger +this number is made, the larger a file gets, but the better performance you will +have. The default and minimum number this can be is 16. The maximum is 256, but +more than 64 isn't recommended. + +=item * data_sector_size + +This is the size in bytes of a given data sector. Data sectors will chain, so +a value of any size can be stored. However, chaining is expensive in terms of +time. Setting this value to something close to the expected common length of +your scalars will improve your performance. If it is too small, your file will +have a lot of chaining. If it is too large, your file will have a lot of dead +space in it. + +The default for this is 64 bytes. 
The minimum value is 32 and the maximum is +256 bytes. + +B There are between 6 and 10 bytes taken up in each data sector for +bookkeeping. (It's 4 + the number of bytes in your L.) This is +included within the data_sector_size, thus the effective value is 6-10 bytes +less than what you specified. + +B If your strings contain any characters beyond the byte +range, they will be encoded as UTF-8 before being stored in the file. This +will make all non-ASCII characters take up more than one byte each. + +=item * pack_size + +This is the size of the file pointer used throughout the file. The valid values +are: + +=over 4 + +=item * small + +This uses 2-byte offsets, allowing for a maximum file size of 65 KB. + +=item * medium (default) + +This uses 4-byte offsets, allowing for a maximum file size of 4 GB. + +=item * large + +This uses 8-byte offsets, allowing for a maximum file size of 16 XB +(exabytes). This can only be enabled if your Perl is compiled for 64-bit. + +=back + +See L for more information. + +=item * external_refs + +This is a boolean option. When enabled, it allows external references to +database entries to hold on to those entries, even when they are deleted. + +To illustrate, if you retrieve a hash (or array) reference from the +database, + + $foo_hash = $db->{foo}; + +the hash reference is still tied to the database. So if you + + delete $db->{foo}; + +C<$foo_hash> will point to a location in the DB that is no longer valid (we +call this a stale reference). So if you try to retrieve the data from +C<$foo_hash>, + + for(keys %$foo_hash) { + +you will get an error. + +The C option causes C<$foo_hash> to 'hang on' to the +DB entry, so it will not be deleted from the database if there is still a +reference to it in a running program. It will be deleted, instead, when the +C<$foo_hash> variable no longer exists, or is overwritten. + +This has the potential to cause database bloat if your program crashes, so +it is not enabled by default. (See also the L method for an +alternative workaround.) + +=back + +=head1 TIE INTERFACE + +With DBM::Deep you can access your databases using Perl's standard hash/array +syntax. Because all DBM::Deep objects are I to hashes or arrays, you can +treat them as such (but see L, above, and +L, below). DBM::Deep will intercept +all reads/writes and direct them +to the right place -- the DB file. This has nothing to do with the +L section above. This simply tells you how to use DBM::Deep +using regular hashes and arrays, rather than calling functions like C +and C (although those work too). It is entirely up to you how to want +to access your databases. + +=head2 Hashes + +You can treat any DBM::Deep object like a normal Perl hash reference. Add keys, +or even nested hashes (or arrays) using standard Perl syntax: + + my $db = DBM::Deep->new( "foo.db" ); + + $db->{mykey} = "myvalue"; + $db->{myhash} = {}; + $db->{myhash}->{subkey} = "subvalue"; + + print $db->{myhash}->{subkey} . "\n"; + +You can even step through hash keys using the normal Perl C function: + + foreach my $key (keys %$db) { + print "$key: " . $db->{$key} . "\n"; + } + +Remember that Perl's C function extracts I key from the hash and +pushes them onto an array, all before the loop even begins. If you have an +extremely large hash, this may exhaust Perl's memory. 
Instead, consider using +Perl's C function, which pulls keys/values one at a time, using very +little memory: + + while (my ($key, $value) = each %$db) { + print "$key: $value\n"; + } + +Please note that when using C, you should always pass a direct +hash reference, not a lookup. Meaning, you should B do this: + + # NEVER DO THIS + while (my ($key, $value) = each %{$db->{foo}}) { # BAD + +This causes an infinite loop, because for each iteration, Perl is calling +FETCH() on the $db handle, resulting in a "new" hash for foo every time, so +it effectively keeps returning the first key over and over again. Instead, +assign a temporary variable to C<< $db->{foo} >>, then pass that to each(). + +=head2 Arrays + +As with hashes, you can treat any DBM::Deep object like a normal Perl array +reference. This includes inserting, removing and manipulating elements, +and the C, C, C, C and C functions. +The object must have first been created using type +C<< DBM::Deep->TYPE_ARRAY >>, +or simply be a nested array reference inside a hash. Example: + + my $db = DBM::Deep->new( + file => "foo-array.db", + type => DBM::Deep->TYPE_ARRAY + ); + + $db->[0] = "foo"; + push @$db, "bar", "baz"; + unshift @$db, "bah"; + + my $last_elem = pop @$db; # baz + my $first_elem = shift @$db; # bah + my $second_elem = $db->[1]; # bar + + my $num_elements = scalar @$db; + +=head1 OO INTERFACE + +In addition to the I interface, you can also use a standard OO interface +to manipulate all aspects of DBM::Deep databases. Each type of object (hash or +array) has its own methods, but both types share the following common methods: +C, C, C, C and C. C and +C are aliases to C and C, respectively. + +=over + +=item * new() / clone() +X +X + +These are the constructor and copy-functions. + +=item * put() / store() +X +X + +Stores a new hash key/value pair, or sets an array element value. Takes two +arguments, the hash key or array index, and the new value. The value can be +a scalar, hash ref or array ref. Returns true on success, false on failure. + + $db->put("foo", "bar"); # for hashes + $db->put(1, "bar"); # for arrays + +=item * get() / fetch() +X +X + +Fetches the value of a hash key or array element. Takes one argument: the hash +key or array index. Returns a scalar, hash ref or array ref, depending on the +data type stored. + + my $value = $db->get("foo"); # for hashes + my $value = $db->get(1); # for arrays + +=item * exists() +X + +Checks if a hash key or array index exists. Takes one argument: the hash key +or array index. Returns true if it exists, false if not. + + if ($db->exists("foo")) { print "yay!\n"; } # for hashes + if ($db->exists(1)) { print "yay!\n"; } # for arrays + +=item * delete() +X + +Deletes one hash key/value pair or array element. Takes one argument: the hash +key or array index. Returns the data that the element used to contain (just +like Perl's C function), which is C if it did not exist. For +arrays, the remaining elements located after the deleted element are NOT +moved over. The deleted element is essentially just undefined, which is +exactly how Perl's +internal arrays work. + + $db->delete("foo"); # for hashes + $db->delete(1); # for arrays + +=item * clear() +X + +Deletes B hash keys or array elements. Takes no arguments. No return +value. + + $db->clear(); # hashes or arrays + +=item * lock() / unlock() / lock_exclusive() / lock_shared() +X +X +X +X + +q.v. L for more info. + +=item * optimize() +X + +This will compress the datafile so that it takes up as little space as possible. 
+
+=head2 Arrays
+
+As with hashes, you can treat any DBM::Deep object like a normal Perl array
+reference. This includes inserting, removing and manipulating elements,
+and the C<push>, C<pop>, C<shift>, C<unshift> and C<splice> functions.
+The object must have first been created using type
+C<< DBM::Deep->TYPE_ARRAY >>,
+or simply be a nested array reference inside a hash. Example:
+
+    my $db = DBM::Deep->new(
+        file => "foo-array.db",
+        type => DBM::Deep->TYPE_ARRAY
+    );
+
+    $db->[0] = "foo";
+    push @$db, "bar", "baz";
+    unshift @$db, "bah";
+
+    my $last_elem   = pop @$db;   # baz
+    my $first_elem  = shift @$db; # bah
+    my $second_elem = $db->[1];   # bar
+
+    my $num_elements = scalar @$db;
+
+=head1 OO INTERFACE
+
+In addition to the I<tie()> interface, you can also use a standard OO interface
+to manipulate all aspects of DBM::Deep databases. Each type of object (hash or
+array) has its own methods, but both types share the following common methods:
+C<put()>, C<get()>, C<exists()>, C<delete()> and C<clear()>. C<fetch()> and
+C<store()> are aliases to C<get()> and C<put()>, respectively.
+
+=over
+
+=item * new() / clone()
+X<new>
+X<clone>
+
+These are the constructor and copy-functions.
+
+=item * put() / store()
+X<put>
+X<store>
+
+Stores a new hash key/value pair, or sets an array element value. Takes two
+arguments, the hash key or array index, and the new value. The value can be
+a scalar, hash ref or array ref. Returns true on success, false on failure.
+
+    $db->put("foo", "bar"); # for hashes
+    $db->put(1, "bar");     # for arrays
+
+=item * get() / fetch()
+X<get>
+X<fetch>
+
+Fetches the value of a hash key or array element. Takes one argument: the hash
+key or array index. Returns a scalar, hash ref or array ref, depending on the
+data type stored.
+
+    my $value = $db->get("foo"); # for hashes
+    my $value = $db->get(1);     # for arrays
+
+=item * exists()
+X<exists>
+
+Checks if a hash key or array index exists. Takes one argument: the hash key
+or array index. Returns true if it exists, false if not.
+
+    if ($db->exists("foo")) { print "yay!\n"; } # for hashes
+    if ($db->exists(1)) { print "yay!\n"; }     # for arrays
+
+=item * delete()
+X<delete>
+
+Deletes one hash key/value pair or array element. Takes one argument: the hash
+key or array index. Returns the data that the element used to contain (just
+like Perl's C<delete> function), which is C<undef> if it did not exist. For
+arrays, the remaining elements located after the deleted element are NOT
+moved over. The deleted element is essentially just undefined, which is
+exactly how Perl's internal arrays work.
+
+    $db->delete("foo"); # for hashes
+    $db->delete(1);     # for arrays
+
+=item * clear()
+X<clear>
+
+Deletes B<all> hash keys or array elements. Takes no arguments. No return
+value.
+
+    $db->clear(); # hashes or arrays
+
+=item * lock() / unlock() / lock_exclusive() / lock_shared()
+X<lock>
+X<unlock>
+X<lock_exclusive>
+X<lock_shared>
+
+q.v. L</LOCKING> for more info.
+
+=item * optimize()
+X<optimize>
+
+This will compress the datafile so that it takes up as little space as
+possible. There is a freespace manager so that when space is freed up, it is
+used before extending the size of the datafile. But, that freespace just sits
+in the datafile unless C<optimize()> is called.
+
+C<optimize()> basically copies everything into a new database, so, if it is
+in version 1.0003 format, it will be upgraded.
+
+=item * import()
+X<import>
+
+Unlike simple assignment, C<import()> does not tie the right-hand side. Instead,
+a copy of your data is put into the DB. C<import()> takes either an arrayref (if
+your DB is an array) or a hashref (if your DB is a hash). C<import()> will die
+if anything else is passed in.
+
+=item * export()
+X<export>
+
+This returns a complete copy of the data structure at the point you do the export.
+This copy is in RAM, not on disk like the DB is.
+
+=item * begin_work() / commit() / rollback()
+
+These are the transactional functions. See L</TRANSACTIONS> for more
+information.
+
+=item * supports( $option )
+X<supports>
+
+This returns a boolean indicating whether this instance of DBM::Deep
+supports that feature. C<$option> can be one of:
+
+=over 4
+
+=item * transactions
+X<transactions>
+
+=item * unicode
+X<unicode>
+
+=back
+
+=item * db_version()
+X<db_version>
+
+This returns the version of the database format that the current database
+is in. This is specified as the earliest version of DBM::Deep that supports
+it.
+
+For the File back end, this will be 1.0003 or 2.
+
+For the DBI back end, it is currently always 1.0020.
+
+=back
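+For example, a guard for optional features might look like this (a sketch;
+the feature probe is the documented C<supports()> call, everything else is
+illustrative):
+
+    # Only use transactions when this back end provides them
+    if ( $db->supports('transactions') ) {
+        $db->begin_work;
+        $db->{foo} = "bar";
+        $db->commit;
+    }
+    else {
+        $db->{foo} = "bar";    # no transactional protection available
+    }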
+
+=head2 Hashes
+
+For hashes, DBM::Deep supports all the common methods described above, and the
+following additional methods: C<first_key()> and C<next_key()>.
+
+=over
+
+=item * first_key()
+X<first_key>
+
+Returns the "first" key in the hash. As with built-in Perl hashes, keys are
+fetched in an undefined order (which appears random). Takes no arguments,
+returns the key as a scalar value.
+
+    my $key = $db->first_key();
+
+=item * next_key()
+X<next_key>
+
+Returns the "next" key in the hash, given the previous one as the sole argument.
+Returns undef if there are no more keys to be fetched.
+
+    $key = $db->next_key($key);
+
+=back
+
+Here are some examples of using hashes:
+
+    my $db = DBM::Deep->new( "foo.db" );
+
+    $db->put("foo", "bar");
+    print "foo: " . $db->get("foo") . "\n";
+
+    $db->put("baz", {}); # new child hash ref
+    $db->get("baz")->put("buz", "biz");
+    print "buz: " . $db->get("baz")->get("buz") . "\n";
+
+    my $key = $db->first_key();
+    while ($key) {
+        print "$key: " . $db->get($key) . "\n";
+        $key = $db->next_key($key);
+    }
+
+    if ($db->exists("foo")) { $db->delete("foo"); }
+
+=head2 Arrays
+
+For arrays, DBM::Deep supports all the common methods described above, and the
+following additional methods: C<length()>, C<push()>, C<pop()>, C<shift()>,
+C<unshift()> and C<splice()>.
+
+=over
+
+=item * length()
+X<length>
+
+Returns the number of elements in the array. Takes no arguments.
+
+    my $len = $db->length();
+
+=item * push()
+X<push>
+
+Adds one or more elements onto the end of the array. Accepts scalars, hash
+refs or array refs. No return value.
+
+    $db->push("foo", "bar", {});
+
+=item * pop()
+X<pop>
+
+Fetches the last element in the array, and deletes it. Takes no arguments.
+Returns undef if array is empty. Returns the element value.
+
+    my $elem = $db->pop();
+
+=item * shift()
+X<shift>
+
+Fetches the first element in the array, deletes it, then shifts all the
+remaining elements over to take up the space. Returns the element value. This
+method is not recommended with large arrays -- see L</Large Arrays> below for
+details.
+
+    my $elem = $db->shift();
+
+=item * unshift()
+X<unshift>
+
+Inserts one or more elements onto the beginning of the array, shifting all
+existing elements over to make room. Accepts scalars, hash refs or array refs.
+No return value. This method is not recommended with large arrays -- see
+L</Large Arrays> below for details.
+
+    $db->unshift("foo", "bar", {});
+
+=item * splice()
+X<splice>
+
+Performs exactly like Perl's built-in function of the same name. See
+L<perlfunc/splice> for usage -- it is too complicated to document here. This
+method is not recommended with large arrays -- see L</Large Arrays> below for
+details.
+
+=back
+
+Here are some examples of using arrays:
+
+    my $db = DBM::Deep->new(
+        file => "foo.db",
+        type => DBM::Deep->TYPE_ARRAY
+    );
+
+    $db->push("bar", "baz");
+    $db->unshift("foo");
+    $db->put(3, "buz");
+
+    my $len = $db->length();
+    print "length: $len\n"; # 4
+
+    for (my $k = 0; $k < $len; $k++) {
+        print "$k: " . $db->get($k) . "\n";
+    }
+
+    $db->splice(1, 2, "biz", "baf");
+
+    while (my $elem = shift @$db) {
+        print "shifted: $elem\n";
+    }
+
+=head1 LOCKING
+
+Enable or disable automatic file locking by passing a boolean value to the
+C<locking> parameter when constructing your DBM::Deep object (see L</SETUP>
+above).
+
+    my $db = DBM::Deep->new(
+        file => "foo.db",
+        locking => 1
+    );
+
+This causes DBM::Deep to C<flock()> the underlying filehandle with exclusive
+mode for writes, and shared mode for reads. This is required if you have
+multiple processes accessing the same database file, to avoid file corruption.
+Please note that C<flock()> does NOT work for files over NFS. See
+L</DB over NFS> below for more.
+
+=head2 Explicit Locking
+
+You can explicitly lock a database, so it remains locked for multiple
+actions. This is done by calling the C<lock_exclusive()> method (for when you
+want to write) or the C<lock_shared()> method (for when you want to read).
+This is particularly useful for things like counters, where the current value
+needs to be fetched, then incremented, then stored again.
+
+    $db->lock_exclusive();
+    my $counter = $db->get("counter");
+    $counter++;
+    $db->put("counter", $counter);
+    $db->unlock();
+
+    # or...
+
+    $db->lock_exclusive();
+    $db->{counter}++;
+    $db->unlock();
+
+=head2 Win32/Cygwin
+
+Due to Win32 actually enforcing the read-only status of a shared lock, all
+locks on Win32 and cygwin are exclusive. This is because of how autovivification
+currently works. Hopefully, this will go away in a future release.
+
+=head1 IMPORTING/EXPORTING
+
+You can import existing complex structures by calling the C<import()> method,
+and export an entire database into an in-memory structure using the C<export()>
+method. Both are examined here.
+
+=head2 Importing
+
+Say you have an existing hash with nested hashes/arrays inside it. Instead of
+walking the structure and adding keys/elements to the database as you go,
+simply pass a reference to the C<import()> method. This recursively adds
+everything to an existing DBM::Deep object for you. Here is an example:
+
+    my $struct = {
+        key1 => "value1",
+        key2 => "value2",
+        array1 => [ "elem0", "elem1", "elem2" ],
+        hash1 => {
+            subkey1 => "subvalue1",
+            subkey2 => "subvalue2"
+        }
+    };
+
+    my $db = DBM::Deep->new( "foo.db" );
+    $db->import( $struct );
+
+    print $db->{key1} . "\n"; # prints "value1"
+
+This recursively imports the entire C<$struct> object into C<$db>, including
+all nested hashes and arrays. If the DBM::Deep object contains existing data,
+keys are merged with the existing ones, replacing if they already exist.
+The C<import()> method can be called on any database level (not just the base
+level), and works with both hash and array DB types.
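+The merge behaviour, in a sketch (keys illustrative):
+
+    # Existing keys are replaced, new keys are added
+    $db->{key1} = "old";
+    $db->import( { key1 => "new", key3 => "added" } );
+
+    print $db->{key1}; # prints "new"   -- existing key replaced
+    print $db->{key3}; # prints "added" -- new key merged in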
+
+B<Note:> Make sure your existing structure has no circular references in it.
+These will cause an infinite loop when importing. There are plans to fix this
+in a later release.
+
+=head2 Exporting
+
+Calling the C<export()> method on an existing DBM::Deep object will return
+a reference to a new in-memory copy of the database. The export is done
+recursively, so all nested hashes/arrays are all exported to standard Perl
+objects. Here is an example:
+
+    my $db = DBM::Deep->new( "foo.db" );
+
+    $db->{key1} = "value1";
+    $db->{key2} = "value2";
+    $db->{hash1} = {};
+    $db->{hash1}->{subkey1} = "subvalue1";
+    $db->{hash1}->{subkey2} = "subvalue2";
+
+    my $struct = $db->export();
+
+    print $struct->{key1} . "\n"; # prints "value1"
+
+This makes a complete copy of the database in memory, and returns a reference
+to it. The C<export()> method can be called on any database level (not just
+the base level), and works with both hash and array DB types. Be careful of
+large databases -- you can store a lot more data in a DBM::Deep object than an
+in-memory Perl structure.
+
+B<Note:> Make sure your database has no circular references in it.
+These will cause an infinite loop when exporting. There are plans to fix this
+in a later release.
+
+=head1 FILTERS
+
+DBM::Deep has a number of hooks where you can specify your own Perl function
+to perform filtering on incoming or outgoing data. This is a perfect
+way to extend the engine, and implement things like real-time compression or
+encryption. Filtering applies to the base DB level, and all child hashes /
+arrays. Filter hooks can be specified when your DBM::Deep object is first
+constructed, or by calling the C<set_filter()> method at any time. There are
+four available filter hooks.
+
+=head2 set_filter()
+
+This method takes two parameters - the filter type and the filter subreference.
+The four types are:
+
+=over
+
+=item * filter_store_key
+
+This filter is called whenever a hash key is stored. It
+is passed the incoming key, and expected to return a transformed key.
+
+=item * filter_store_value
+
+This filter is called whenever a hash key or array element is stored. It
+is passed the incoming value, and expected to return a transformed value.
+
+=item * filter_fetch_key
+
+This filter is called whenever a hash key is fetched (i.e. via
+C<first_key()> or C<next_key()>). It is passed the transformed key,
+and expected to return the plain key.
+
+=item * filter_fetch_value
+
+This filter is called whenever a hash key or array element is fetched.
+It is passed the transformed value, and expected to return the plain value.
+
+=back
+
+Here are the two ways to setup a filter hook:
+
+    my $db = DBM::Deep->new(
+        file => "foo.db",
+        filter_store_value => \&my_filter_store,
+        filter_fetch_value => \&my_filter_fetch
+    );
+
+    # or...
+
+    $db->set_filter( "store_value", \&my_filter_store );
+    $db->set_filter( "fetch_value", \&my_filter_fetch );
+
+Your filter function will be called only when dealing with SCALAR keys or
+values. When nested hashes and arrays are being stored/fetched, filtering
+is bypassed. Filters are called as static functions, passed a single SCALAR
+argument, and expected to return a single SCALAR value. If you want to
+remove a filter, set the function reference to C<undef>:
+
+    $db->set_filter( "store_value", undef );
+
+=head2 Examples
+
+Please read L<DBM::Deep::Cookbook> for examples of filters.
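+As a trivial, self-inverse illustration (a sketch only, not taken from the
+Cookbook), here is a ROT13 filter pair for values:
+
+    # ROT13 is its own inverse, so the same sub serves both hooks
+    my $rot13 = sub {
+        my $s = shift;
+        $s =~ tr/A-Za-z/N-ZA-Mn-za-m/;
+        return $s;
+    };
+
+    $db->set_filter( "store_value", $rot13 );  # obfuscate on the way in
+    $db->set_filter( "fetch_value", $rot13 );  # restore on the way out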
+
+=head1 ERROR HANDLING
+
+Most DBM::Deep methods return a true value for success, and call die() on
+failure. You can wrap calls in an eval block to catch the die.
+
+    my $db = DBM::Deep->new( "foo.db" ); # create hash
+    eval { $db->push("foo"); };          # ILLEGAL -- push is array-only call
+
+    print $@; # prints error message
+
+=head1 LARGEFILE SUPPORT
+
+If you have a 64-bit system, and your Perl is compiled with both LARGEFILE
+and 64-bit support, you I<may> be able to create databases larger than 4 GB.
+DBM::Deep by default uses 32-bit file offset tags, but these can be changed
+by specifying the 'pack_size' parameter when constructing the file.
+
+    DBM::Deep->new(
+        file      => $filename,
+        pack_size => 'large',
+    );
+
+This tells DBM::Deep to pack all file offsets with 8-byte (64-bit) quad words
+instead of 32-bit longs. After setting these values your DB files have a
+theoretical maximum size of 16 XB (exabytes).
+
+You can also use C<< pack_size => 'small' >> in order to use 16-bit file
+offsets.
+
+B<Note:> Changing these values will B<not> work for existing database files.
+Only change this for new files. Once the value has been set, it is stored in
+the file's header and cannot be changed for the life of the file. These
+parameters are per-file, meaning you can access 32-bit and 64-bit files, as
+you choose.
+
+B<Note:> We have not personally tested files larger than 4 GB -- all our
+systems have only a 32-bit Perl. However, we have received user reports that
+this does indeed work.
+
+=head1 LOW-LEVEL ACCESS
+
+If you require low-level access to the underlying filehandle that DBM::Deep uses,
+you can call the C<_fh()> method, which returns the handle:
+
+    my $fh = $db->_fh();
+
+This method can be called on the root level of the database, or any child
+hashes or arrays. All levels share a I<root> structure, which contains things
+like the filehandle, a reference counter, and all the options specified
+when you created the object. You can get access to this file object by
+calling the C<_storage()> method.
+
+    my $file_obj = $db->_storage();
+
+This is useful for changing options after the object has already been created,
+such as enabling/disabling locking. You can also store your own temporary user
+data in this structure (be wary of name collision), which is then accessible from
+any child hash or array.
+
+=head1 CIRCULAR REFERENCES
+
+DBM::Deep has full support for circular references. Meaning you
+can have a nested hash key or array element that points to a parent object.
+This relationship is stored in the DB file, and is preserved between sessions.
+Here is an example:
+
+    my $db = DBM::Deep->new( "foo.db" );
+
+    $db->{foo} = "bar";
+    $db->{circle} = $db; # ref to self
+
+    print $db->{foo} . "\n";           # prints "bar"
+    print $db->{circle}->{foo} . "\n"; # prints "bar" again
+
+This also works as expected with array and hash references. So, the following
+works as expected:
+
+    $db->{foo} = [ 1 .. 3 ];
+    $db->{bar} = $db->{foo};
+
+    push @{$db->{foo}}, 42;
+    is( $db->{bar}[-1], 42 ); # Passes
+
+This, however, does I<not> extend to assignments from one DB file to another.
+So, the following will throw an error:
+
+    my $db1 = DBM::Deep->new( "foo.db" );
+    my $db2 = DBM::Deep->new( "bar.db" );
+
+    $db1->{foo} = [];
+    $db2->{foo} = $db1->{foo}; # dies
+
+B<Note>: Passing the object to a function that recursively walks the
+object tree (such as I<Data::Dumper> or even the built-in C<optimize()> or
+C<export()> methods) will result in an infinite loop. This will be fixed in
+a future release by adding singleton support.
+
+=head1 TRANSACTIONS
+
+As of 1.0000, DBM::Deep has ACID transactions. Every DBM::Deep object is completely
+transaction-ready - it is not an option you have to turn on. You do have to
+specify how many transactions may run simultaneously (q.v. L</num_txns>).
+
+Three new methods have been added to support them. They are:
+
+=over 4
+
+=item * begin_work()
+
+This starts a transaction.
+
+=item * commit()
+
+This applies the changes done within the transaction to the mainline and ends
+the transaction.
+
+=item * rollback()
+
+This discards the changes done within the transaction to the mainline and ends
+the transaction.
+
+=back
+
+Transactions in DBM::Deep are done using a variant of the MVCC method, the
+same method used by the InnoDB MySQL engine.
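+A minimal sketch (assuming the File back end, which supports transactions,
+and a C<num_txns> value above 1):
+
+    my $db = DBM::Deep->new(
+        file     => "foo.db",
+        num_txns => 4,          # the HEAD plus up to 3 running transactions
+    );
+
+    $db->{counter} = 0;
+
+    $db->begin_work;
+    $db->{counter} = 42;
+    $db->rollback;              # change is discarded
+    print $db->{counter}, "\n"; # prints 0
+
+    $db->begin_work;
+    $db->{counter} = 42;
+    $db->commit;                # change is applied to the mainline
+    print $db->{counter}, "\n"; # prints 42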
+
+=head1 MIGRATION
+
+As of 1.0000, the file format has changed. To aid in upgrades, a migration
+script is provided within the CPAN distribution, called
+F<utils/upgrade_db.pl>.
+
+B<Note:> This script is not installed onto your system because it carries a copy
+of every version prior to the current version.
+
+As of version 2.0000, databases created by old versions back to 1.0003 can
+be read, but new features may not be available unless the database is
+upgraded first.
+
+=head1 TODO
+
+The following are items that are planned to be added in future releases. These
+are separate from the L</CAVEATS, ISSUES & BUGS> below.
+
+=head2 Sub-Transactions
+
+Right now, you cannot run a transaction within a transaction. Removing this
+restriction is technically straightforward, but the combinatorial explosion of
+possible use cases hurts my head. If this is something you want to see
+immediately, please submit many test cases.
+
+=head2 Caching
+
+If a client is willing to assert upon opening the file that this process will be
+the only consumer of that datafile, then there are a number of caching
+possibilities that can be taken advantage of. This does, however, mean that
+DBM::Deep is more vulnerable to losing data due to unflushed changes. It also
+means a much larger in-memory footprint. As such, it's not clear exactly how
+this should be done. Suggestions are welcome.
+
+=head2 Ram-only
+
+The techniques used in DBM::Deep simply require a seekable contiguous
+datastore. This could just as easily be a large string as a file. By using
+substr, the STM capabilities of DBM::Deep could be used within a
+single process. I have no idea how I'd specify this, though. Suggestions are
+welcome.
+
+=head2 Different contention resolution mechanisms
+
+Currently, the only contention resolution mechanism is last-write-wins. This
+is the mechanism used by most RDBMSes and should be good enough for most uses.
+For advanced uses of STM, other contention mechanisms will be needed. If you
+have an idea of how you'd like to see contention resolution in DBM::Deep,
+please let me know.
+
+=head1 CAVEATS, ISSUES & BUGS
+
+This section describes all the known issues with DBM::Deep. These are issues
+that are either intractable or depend on some feature within Perl working
+exactly right. If you have found something that is not listed below, please
+send an e-mail to L<rkinyon@cpan.org>.
+Likewise, if you think you know of a way around one of these issues, please
+let me know.
+
+=head2 References
+
+(The following assumes a high level of Perl understanding, specifically of
+references. Most users can safely skip this section.)
+
+Currently, the only references supported are HASH and ARRAY. The other reference
+types (SCALAR, CODE, GLOB, and REF) cannot be supported for various reasons.
+
+=over 4
+
+=item * GLOB
+
+These are things like filehandles and other sockets. They can't be supported
+because it's completely unclear how DBM::Deep should serialize them.
+
+=item * SCALAR / REF
+
+The discussion here refers to the following type of example:
+
+    my $x = 25;
+    $db->{key1} = \$x;
+
+    $x = 50;
+
+    # In some other process ...
+
+    my $val = ${ $db->{key1} };
+
+    is( $val, 50, "What actually gets stored in the DB file?" );
+
+The problem is one of synchronization. When the variable being referred to
+changes value, the reference isn't notified, which is kind of the point of
+references. This means that the new value won't be stored in the datafile for
+other processes to read. There is no TIEREF.
+
+It is theoretically possible to store references to values already within a
+DBM::Deep object because everything already is synchronized, but the change to
+the internals would be quite large. Specifically, DBM::Deep would have to tie
+every single value that is stored. This would bloat the RAM footprint of
+DBM::Deep at least twofold (if not more) and be a significant performance drain,
+all to support a feature that has never been requested.
+
+=item * CODE
+
+L<Data::Dump::Streamer> provides a mechanism for serializing coderefs,
+including saving off all closure state. This would allow for DBM::Deep to
+store the code for a subroutine. Then, whenever the subroutine is read, the
+code could be C<eval()>'ed into being. However, just as for SCALAR and REF,
+that closure state may change without notifying the DBM::Deep object storing
+the reference. Again, this would generally be considered a feature.
+
+=back
+
+=head2 External references and transactions
+
+If you do C<< my $x = $db->{foo}; >>, then start a transaction, $x will be
+referencing the database from outside the transaction. A fix for this (and
+other issues with how external references into the database are handled) is
+being looked into. This is the skipped set of tests in t/39_singletons.t, and
+a related issue is the focus of t/37_delete_edge_cases.t.
+
+=head2 File corruption
+
+The current level of error handling in DBM::Deep is minimal. Files I<are> checked
+for a 32-bit signature when opened, but any other form of corruption in the
+datafile can cause segmentation faults. DBM::Deep may try to C<seek()> past
+the end of a file, or get stuck in an infinite loop depending on the level and
+type of corruption. File write operations are not checked for failure (for
+speed), so if you happen to run out of disk space, DBM::Deep will probably fail in
+a bad way. These things will be addressed in a later version of DBM::Deep.
+
+=head2 DB over NFS
+
+Beware of using DBM::Deep files over NFS. DBM::Deep uses flock(), which works
+well on local filesystems, but will NOT protect you from file corruption over
+NFS. I've heard about setting up your NFS server with a locking daemon, then
+using C<flock()> to lock your files, but your mileage may vary there as well.
+From what I understand, there is no real way to do it. However, if you need
+access to the underlying filehandle in DBM::Deep for using some other kind of
+locking scheme like C<lockf()>, see the L</LOW-LEVEL ACCESS> section above.
+
+=head2 Copying Objects
+
+Beware of copying tied objects in Perl. Very strange things can happen.
+Instead, use DBM::Deep's C<clone()> method which safely copies the object and
+returns a new, blessed and tied hash or array to the same level in the DB.
+
+    my $copy = $db->clone();
+
+B<Note>: Since clone() here is cloning the object, not the database location,
+any modifications to either $db or $copy will be visible to both.
+
+=head2 Stale References
+
+If you take a reference to an array or hash from the database, it is tied
+to the database itself. This means that if the datum in question is
+subsequently deleted from the database, the reference to it will point to
+an invalid location and unpredictable things will happen if you try to use
+it.
+
+So a seemingly innocuous piece of code like this:
+
+    my %hash = %{ $db->{some_hash} };
+
+can fail if another process deletes or clobbers C<< $db->{some_hash} >>
+while the data are being extracted, since S<C<< %{ ... } >>> is not atomic.
+(This actually happened.) The solution is to lock the database before
+reading the data:
+
+    $db->lock_exclusive;
+    my %hash = %{ $db->{some_hash} };
+    $db->unlock;
+
+As of version 1.0024, if you assign a stale reference to a location
+in the database, DBM::Deep will warn, if you have uninitialized warnings
+enabled, and treat the stale reference as C<undef>. An attempt to use a
+stale reference as an array or hash reference will cause an error.
+
+=head2 Large Arrays
+
+Beware of using C<shift()>, C<unshift()> or C<splice()> with large arrays.
+These functions cause every element in the array to move, which can be murder
+on DBM::Deep, as every element has to be fetched from disk, then stored again in
+a different location. This will be addressed in a future version.
+
+This has been somewhat addressed so that the cost is constant, regardless of
+what is stored at those locations. So, small arrays with huge data structures in
+them are faster. But, large arrays are still large.
+
+=head2 Writeonly Files
+
+If you pass in a filehandle to new(), you may have opened it in either a
+readonly or writeonly mode. STORE will verify that the filehandle is writable.
+However, there doesn't seem to be a good way to determine if a filehandle is
+readable. And, if the filehandle isn't readable, it's not clear what will
+happen. So, don't do that.
+
+=head2 Assignments Within Transactions
+
+The following will I<not> work as one might expect:
+
+    my $x = { a => 1 };
+
+    $db->begin_work;
+    $db->{foo} = $x;
+    $db->rollback;
+
+    is( $x->{a}, 1 ); # This will fail!
+
+The problem is that the moment a reference is used as the rvalue to a DBM::Deep
+object's lvalue, it becomes tied itself. This is so that future changes to
+C<$x> can be tracked within the DBM::Deep file and is considered to be a
+feature. By the time the rollback occurs, there is no knowledge that there had
+been an C<$x> or what memory location to assign an C<export()> to.
+
+B<Note:> This does not affect importing because imports do a walk over the
+reference to be imported in order to explicitly leave it untied.
+
+=head1 CODE COVERAGE
+
+L<Devel::Cover> is used to test the code coverage of the tests. Below is the
+L<Devel::Cover> report on this distribution's test suite.
+
+  ---------------------------- ------ ------ ------ ------ ------ ------ ------
+  File                           stmt   bran   cond    sub    pod   time  total
+  ---------------------------- ------ ------ ------ ------ ------ ------ ------
+  blib/lib/DBM/Deep.pm          100.0   89.1   82.9  100.0  100.0   32.5   98.1
+  blib/lib/DBM/Deep/Array.pm    100.0   94.4  100.0  100.0  100.0    5.2   98.8
+  blib/lib/DBM/Deep/Engine.pm   100.0   92.9  100.0  100.0  100.0    7.4  100.0
+  ...ib/DBM/Deep/Engine/DBI.pm   95.0   73.1  100.0  100.0  100.0    1.5   90.4
+  ...b/DBM/Deep/Engine/File.pm   92.3   78.5   88.9  100.0  100.0    4.9   90.3
+  blib/lib/DBM/Deep/Hash.pm     100.0  100.0  100.0  100.0  100.0    3.8  100.0
+  .../lib/DBM/Deep/Iterator.pm  100.0    n/a    n/a  100.0  100.0    0.0  100.0
+  .../DBM/Deep/Iterator/DBI.pm  100.0  100.0    n/a  100.0  100.0    1.2  100.0
+  ...DBM/Deep/Iterator/File.pm   92.5   84.6    n/a  100.0   66.7    0.6   90.0
+  ...erator/File/BucketList.pm  100.0   75.0    n/a  100.0   66.7    0.4   93.8
+  ...ep/Iterator/File/Index.pm  100.0  100.0    n/a  100.0  100.0    0.2  100.0
+  blib/lib/DBM/Deep/Null.pm      87.5    n/a    n/a   75.0    n/a    0.0   83.3
+  blib/lib/DBM/Deep/Sector.pm    91.7    n/a    n/a   83.3    0.0    6.7   74.4
+  ...ib/DBM/Deep/Sector/DBI.pm   96.8   83.3    n/a  100.0    0.0    1.0   89.8
+  ...p/Sector/DBI/Reference.pm  100.0   95.5  100.0  100.0    0.0    2.2   91.2
+  ...Deep/Sector/DBI/Scalar.pm  100.0  100.0    n/a  100.0    0.0    1.1   92.9
+  ...b/DBM/Deep/Sector/File.pm   96.0   87.5  100.0   92.3   25.0    2.2   91.0
+  ...Sector/File/BucketList.pm   98.2   85.7   83.3  100.0    0.0    3.3   89.4
+  .../Deep/Sector/File/Data.pm  100.0    n/a    n/a  100.0    0.0    0.1   90.9
+  ...Deep/Sector/File/Index.pm  100.0   80.0   33.3  100.0    0.0    0.8   83.1
+  .../Deep/Sector/File/Null.pm  100.0  100.0    n/a  100.0    0.0    0.0   91.7
+  .../Sector/File/Reference.pm  100.0   90.0   80.0  100.0    0.0    1.4   91.5
+  ...eep/Sector/File/Scalar.pm   98.4   87.5    n/a  100.0    0.0    0.8   91.9
+  blib/lib/DBM/Deep/Storage.pm  100.0    n/a    n/a  100.0  100.0    0.0  100.0
+  ...b/DBM/Deep/Storage/DBI.pm   97.3   70.8    n/a  100.0   38.5    6.7   87.0
+  .../DBM/Deep/Storage/File.pm   96.6   77.1   80.0   95.7  100.0   16.0   91.8
+  Total                          99.3   85.2   84.9   99.8   63.3  100.0   97.6
+  ---------------------------- ------ ------ ------ ------ ------ ------ ------
+
+=head1 MORE INFORMATION
+
+Check out the DBM::Deep Google Group at
+L<http://groups.google.com/group/DBM-Deep>
+or send email to L<DBM-Deep@googlegroups.com>.
+You can also visit #dbm-deep on irc.perl.org
+
+The source code repository is at L<http://github.com/robkinyon/dbm-deep>
+
+=head1 MAINTAINERS
+
+Rob Kinyon, L<rkinyon@cpan.org>
+
+Originally written by Joseph Huckaby, L<jhuckaby@cpan.org>
+
+=head1 SPONSORS
+
+Stonehenge Consulting (L<http://www.stonehenge.com/>) sponsored the
+development of transactions and freespace management, leading to the 1.0000
+release. A great debt of gratitude goes out to them for their continuing
+leadership in and support of the Perl community.
+
+=head1 CONTRIBUTORS
+
+The following have contributed greatly to make DBM::Deep what it is today:
+
+=over 4
+
+=item * Adam Sah and Rich Gaushell for innumerable contributions early on.
+
+=item * Dan Golden and others at YAPC::NA 2006 for helping me design through transactions.
+
+=item * James Stanley for a bug fix.
+
+=item * David Steinbrunner for fixing typos and adding repository cpan metadata.
+
+=item * H. Merijn Brand for fixing the POD escapes.
+
+=item * Breno G. de Oliveira for minor packaging tweaks.
+
+=back
+
+=head1 SEE ALSO
+
+L<DBM::Deep::Cookbook(3)>
+
+L<perltie(1)>, L<Tie::Hash(3)>, L<Fcntl(3)>, L<flock(2)>, L<lockf(3)>,
+L<nfs(5)>
+
+=head1 LICENSE
+
+Copyright (c) 2007-14 Rob Kinyon. All Rights Reserved.
+This is free software, you may use it and distribute it under the same terms
+as Perl itself.
+ +=cut diff --git a/src/modules/DBM/Deep/Array.pm b/src/modules/DBM/Deep/Array.pm new file mode 100644 index 0000000..a3291bd --- /dev/null +++ b/src/modules/DBM/Deep/Array.pm @@ -0,0 +1,427 @@ +package DBM::Deep::Array; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; +no warnings 'recursion'; + +# This is to allow DBM::Deep::Array to handle negative indices on +# its own. Otherwise, Perl would intercept the call to negative +# indices for us. This was causing bugs for negative index handling. +our $NEGATIVE_INDICES = 1; + +use base 'DBM::Deep'; + +use Scalar::Util (); + +sub _get_self { + # We used to have + # eval { local $SIG{'__DIE__'}; tied( @{$_[0]} ) } || $_[0] + # but this does not always work during global destruction (DBM::Deep’s + # destructor calls this method), but will return $_[0] even when $_[0] + # is tied, if it’s tied to undef. In those cases it’s better to return + # undef, so the destructor can tell not to do anything, and, if any- + # thing else calls us, it will fail with a more helpful error message. + + Scalar::Util::reftype $_[0] eq 'ARRAY' ? tied @{$_[0]} : $_[0]; +} + +sub _repr { [] } + +sub TIEARRAY { + my $class = shift; + my $args = $class->_get_args( @_ ); + + $args->{type} = $class->TYPE_ARRAY; + + return $class->_init($args); +} + +sub FETCH { + my $self = shift->_get_self; + my ($key) = @_; + + $self->lock_shared; + + if ( !defined $key ) { + $self->unlock; + DBM::Deep->_throw_error( "Cannot use an undefined array index." ); + } + elsif ( $key =~ /^-?\d+$/ ) { + if ( $key < 0 ) { + $key += $self->FETCHSIZE; + unless ( $key >= 0 ) { + $self->unlock; + return; + } + } + } + elsif ( $key ne 'length' ) { + $self->unlock; + DBM::Deep->_throw_error( "Cannot use '$key' as an array index." ); + } + + my $rv = $self->SUPER::FETCH( $key ); + + $self->unlock; + + return $rv; +} + +sub STORE { + my $self = shift->_get_self; + my ($key, $value) = @_; + + $self->lock_exclusive; + + my $size; + my $idx_is_numeric; + if ( !defined $key ) { + $self->unlock; + DBM::Deep->_throw_error( "Cannot use an undefined array index." ); + } + elsif ( $key =~ /^-?\d+$/ ) { + $idx_is_numeric = 1; + if ( $key < 0 ) { + $size = $self->FETCHSIZE; + if ( $key + $size < 0 ) { + die( "Modification of non-creatable array value attempted, subscript $key" ); + } + $key += $size + } + } + elsif ( $key ne 'length' ) { + $self->unlock; + DBM::Deep->_throw_error( "Cannot use '$key' as an array index." ); + } + + my $rv = $self->SUPER::STORE( $key, $value ); + + if ( $idx_is_numeric ) { + $size = $self->FETCHSIZE unless defined $size; + if ( $key >= $size ) { + $self->STORESIZE( $key + 1 ); + } + } + + $self->unlock; + + return $rv; +} + +sub EXISTS { + my $self = shift->_get_self; + my ($key) = @_; + + $self->lock_shared; + + if ( !defined $key ) { + $self->unlock; + DBM::Deep->_throw_error( "Cannot use an undefined array index." ); + } + elsif ( $key =~ /^-?\d+$/ ) { + if ( $key < 0 ) { + $key += $self->FETCHSIZE; + unless ( $key >= 0 ) { + $self->unlock; + return; + } + } + } + elsif ( $key ne 'length' ) { + $self->unlock; + DBM::Deep->_throw_error( "Cannot use '$key' as an array index." ); + } + + my $rv = $self->SUPER::EXISTS( $key ); + + $self->unlock; + + return $rv; +} + +sub DELETE { + my $self = shift->_get_self; + my ($key) = @_; + warn "ARRAY::DELETE($self,$key)\n" if DBM::Deep::DEBUG; + + $self->lock_exclusive; + + my $size = $self->FETCHSIZE; + if ( !defined $key ) { + $self->unlock; + DBM::Deep->_throw_error( "Cannot use an undefined array index." 
); + } + elsif ( $key =~ /^-?\d+$/ ) { + if ( $key < 0 ) { + $key += $size; + unless ( $key >= 0 ) { + $self->unlock; + return; + } + } + } + elsif ( $key ne 'length' ) { + $self->unlock; + DBM::Deep->_throw_error( "Cannot use '$key' as an array index." ); + } + + my $rv = $self->SUPER::DELETE( $key ); + + if ($rv && $key == $size - 1) { + $self->STORESIZE( $key ); + } + + $self->unlock; + + return $rv; +} + +# Now that we have a real Reference sector, we should store arrayzize there. +# However, arraysize needs to be transactionally-aware, so a simple location to +# store it isn't going to work. +sub FETCHSIZE { + my $self = shift->_get_self; + + $self->lock_shared; + + my $SAVE_FILTER = $self->_engine->storage->{filter_fetch_value}; + $self->_engine->storage->{filter_fetch_value} = undef; + + my $size = $self->FETCH('length') || 0; + + $self->_engine->storage->{filter_fetch_value} = $SAVE_FILTER; + + $self->unlock; + + return $size; +} + +sub STORESIZE { + my $self = shift->_get_self; + my ($new_length) = @_; + + $self->lock_exclusive; + + my $SAVE_FILTER = $self->_engine->storage->{filter_store_value}; + $self->_engine->storage->{filter_store_value} = undef; + + my $result = $self->STORE('length', $new_length, 'length'); + + $self->_engine->storage->{filter_store_value} = $SAVE_FILTER; + + $self->unlock; + + return $result; +} + +sub POP { + my $self = shift->_get_self; + + $self->lock_exclusive; + + my $length = $self->FETCHSIZE(); + + if ($length) { + my $content = $self->FETCH( $length - 1 ); + $self->DELETE( $length - 1 ); + + $self->unlock; + + return $content; + } + else { + $self->unlock; + return; + } +} + +sub PUSH { + my $self = shift->_get_self; + + $self->lock_exclusive; + + my $length = $self->FETCHSIZE(); + + for my $content (@_) { + $self->STORE( $length, $content ); + $length++; + } + + $self->unlock; + + return $length; +} + +# XXX This really needs to be something more direct within the file, not a +# fetch and re-store. -RobK, 2007-09-20 +sub _move_value { + my $self = shift; + my ($old_key, $new_key) = @_; + + return $self->_engine->make_reference( $self, $old_key, $new_key ); +} + +sub SHIFT { + my $self = shift->_get_self; + warn "SHIFT($self)\n" if DBM::Deep::DEBUG; + + $self->lock_exclusive; + + my $length = $self->FETCHSIZE(); + + if ( !$length ) { + $self->unlock; + return; + } + + my $content = $self->DELETE( 0 ); + + # Unless the deletion above has cleared the array ... 
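+    # Every remaining element is then walked down one slot via
+    # _move_value(), i.e. a fetch and re-store per element, so SHIFT costs
+    # O(n) disk operations. This is why SHIFT, UNSHIFT and SPLICE are
+    # discouraged for large arrays (see "Large Arrays" in the main POD).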
+ if ( $length > 1 ) { + for (my $i = 0; $i < $length - 1; $i++) { + $self->_move_value( $i+1, $i ); + } + + $self->DELETE( $length - 1 ); + } + + $self->unlock; + + return $content; +} + +sub UNSHIFT { + my $self = shift->_get_self; + my @new_elements = @_; + + $self->lock_exclusive; + + my $length = $self->FETCHSIZE(); + my $new_size = scalar @new_elements; + + if ($length) { + for (my $i = $length - 1; $i >= 0; $i--) { + $self->_move_value( $i, $i+$new_size ); + } + + $self->STORESIZE( $length + $new_size ); + } + + for (my $i = 0; $i < $new_size; $i++) { + $self->STORE( $i, $new_elements[$i] ); + } + + $self->unlock; + + return $length + $new_size; +} + +sub SPLICE { + my $self = shift->_get_self; + + $self->lock_exclusive; + + my $length = $self->FETCHSIZE(); + + ## + # Calculate offset and length of splice + ## + my $offset = shift; + $offset = 0 unless defined $offset; + if ($offset < 0) { $offset += $length; } + + my $splice_length; + if (scalar @_) { $splice_length = shift; } + else { $splice_length = $length - $offset; } + if ($splice_length < 0) { $splice_length += ($length - $offset); } + + ## + # Setup array with new elements, and copy out old elements for return + ## + my @new_elements = @_; + my $new_size = scalar @new_elements; + + my @old_elements = map { + $self->FETCH( $_ ) + } $offset .. ($offset + $splice_length - 1); + + ## + # Adjust array length, and shift elements to accommodate new section. + ## + if ( $new_size != $splice_length ) { + if ($new_size > $splice_length) { + for (my $i = $length - 1; $i >= $offset + $splice_length; $i--) { + $self->_move_value( $i, $i + ($new_size - $splice_length) ); + } + $self->STORESIZE( $length + $new_size - $splice_length ); + } + else { + for (my $i = $offset + $splice_length; $i < $length; $i++) { + $self->_move_value( $i, $i + ($new_size - $splice_length) ); + } + for (my $i = 0; $i < $splice_length - $new_size; $i++) { + $self->DELETE( $length - 1 ); + $length--; + } + } + } + + ## + # Insert new elements into array + ## + for (my $i = $offset; $i < $offset + $new_size; $i++) { + $self->STORE( $i, shift @new_elements ); + } + + $self->unlock; + + ## + # Return deleted section, or last element in scalar context. + ## + return wantarray ? @old_elements : $old_elements[-1]; +} + +# We don't need to populate it, yet. +# It will be useful, though, when we split out HASH and ARRAY +# Perl will call EXTEND() when the array is likely to grow. +# We don't care, but include it because it gets called at times. +sub EXTEND {} + +sub _copy_node { + my $self = shift; + my ($db_temp) = @_; + + my $length = $self->length(); + for (my $index = 0; $index < $length; $index++) { + $self->_copy_value( \$db_temp->[$index], $self->get($index) ); + } + + return 1; +} + +sub _clear { + my $self = shift; + + my $size = $self->FETCHSIZE; + for my $key ( 0 .. 
$size - 1 ) {
+        $self->_engine->delete_key( $self, $key, $key );
+    }
+    $self->STORESIZE( 0 );
+
+    return;
+}
+
+sub length { (shift)->FETCHSIZE(@_) }
+sub pop { (shift)->POP(@_) }
+sub push { (shift)->PUSH(@_) }
+sub unshift { (shift)->UNSHIFT(@_) }
+sub splice { (shift)->SPLICE(@_) }
+
+# This must be last otherwise we have to qualify all other calls to shift
+# as calls to CORE::shift
+sub shift { (CORE::shift)->SHIFT(@_) }
+
+1;
+__END__
diff --git a/src/modules/DBM/Deep/Cookbook.pod b/src/modules/DBM/Deep/Cookbook.pod
new file mode 100644
index 0000000..2617609
--- /dev/null
+++ b/src/modules/DBM/Deep/Cookbook.pod
@@ -0,0 +1,215 @@
+=head1 NAME
+
+DBM::Deep::Cookbook - Cookbook for DBM::Deep
+
+=head1 DESCRIPTION
+
+This is the Cookbook for L<DBM::Deep>. It contains useful tips and tricks,
+plus some examples of how to do common tasks.
+
+=head1 RECIPES
+
+=head2 Unicode data
+
+If possible, it is highly recommended that you upgrade your database to
+version 2 (using the F<utils/upgrade_db.pl> script in the CPAN
+distribution), in order to use Unicode.
+
+If your databases are still shared by perl installations with older
+DBM::Deep versions, you can use filters to encode strings on the fly:
+
+    my $db = DBM::Deep->new( ... );
+    my $encode_sub = sub { my $s = shift; utf8::encode($s); $s };
+    my $decode_sub = sub { my $s = shift; utf8::decode($s); $s };
+    $db->set_filter( 'store_value' => $encode_sub );
+    $db->set_filter( 'fetch_value' => $decode_sub );
+    $db->set_filter( 'store_key'   => $encode_sub );
+    $db->set_filter( 'fetch_key'   => $decode_sub );
+
+A previous version of this cookbook recommended using
+C<< binmode $db->_fh, ":utf8" >>, but that is I<not> a good idea, as it
+could easily corrupt the database.
+
+=head2 Real-time Encryption Example
+
+B<NOTE>: This is just an example of how to write a filter. This most
+definitely should B<not> be taken as a proper way to write a filter that does
+encryption. (Furthermore, it fails to take Unicode into account.)
+
+Here is a working example that uses the I<Crypt::Blowfish> module to
+do real-time encryption / decryption of keys & values with DBM::Deep Filters.
+Please visit L<http://search.cpan.org/search?module=Crypt::Blowfish> for more
+on I<Crypt::Blowfish>. You'll also need the I<Crypt::CBC> module.
+
+    use DBM::Deep;
+    use Crypt::Blowfish;
+    use Crypt::CBC;
+
+    my $cipher = Crypt::CBC->new({
+        'key'            => 'my secret key',
+        'cipher'         => 'Blowfish',
+        'iv'             => '$KJh#(}q',
+        'regenerate_key' => 0,
+        'padding'        => 'space',
+        'prepend_iv'     => 0
+    });
+
+    my $db = DBM::Deep->new(
+        file               => "foo-encrypt.db",
+        filter_store_key   => \&my_encrypt,
+        filter_store_value => \&my_encrypt,
+        filter_fetch_key   => \&my_decrypt,
+        filter_fetch_value => \&my_decrypt,
+    );
+
+    $db->{key1} = "value1";
+    $db->{key2} = "value2";
+    print "key1: " . $db->{key1} . "\n";
+    print "key2: " . $db->{key2} . "\n";
+
+    undef $db;
+    exit;
+
+    sub my_encrypt {
+        return $cipher->encrypt( $_[0] );
+    }
+    sub my_decrypt {
+        return $cipher->decrypt( $_[0] );
+    }
+
+=head2 Real-time Compression Example
+
+Here is a working example that uses the I<Compress::Zlib> module to do real-time
+compression / decompression of keys & values with DBM::Deep Filters.
+Please visit L<http://search.cpan.org/search?module=Compress::Zlib> for
+more on I<Compress::Zlib>.
+
+    use DBM::Deep;
+    use Compress::Zlib;
+
+    my $db = DBM::Deep->new(
+        file               => "foo-compress.db",
+        filter_store_key   => \&my_compress,
+        filter_store_value => \&my_compress,
+        filter_fetch_key   => \&my_decompress,
+        filter_fetch_value => \&my_decompress,
+    );
+
+    $db->{key1} = "value1";
+    $db->{key2} = "value2";
+    print "key1: " . $db->{key1} . "\n";
+    print "key2: " . $db->{key2} . "\n";
+
+    undef $db;
+    exit;
+
+    sub my_compress {
+        my $s = shift;
+        utf8::encode($s);
+        return Compress::Zlib::memGzip( $s );
+    }
+    sub my_decompress {
+        my $s = Compress::Zlib::memGunzip( shift );
+        utf8::decode($s);
+        return $s;
+    }
+
+B<Note:> Filtering of keys only applies to hashes. Array "keys" are
+actually numerical index numbers, and are not filtered.
+
+=head1 Custom Digest Algorithm
+
+DBM::Deep by default uses the I<Message Digest 5> (MD5) algorithm for hashing
+keys. However you can override this, and use another algorithm (such as SHA-256)
+or even write your own. But please note that DBM::Deep currently expects zero
+collisions, so your algorithm has to be I<perfect>, so to speak. Collision
+detection may be introduced in a later version.
+
+You can specify a custom digest algorithm by passing it into the parameter
+list for new(), passing a reference to a subroutine as the 'digest' parameter,
+and the length of the algorithm's hashes (in bytes) as the 'hash_size'
+parameter. Here is a working example that uses a 256-bit hash from the
+I<Digest::SHA256> module. Please see
+L<http://search.cpan.org/search?module=Digest::SHA256> for more information.
+
+The value passed to your digest function will be encoded as UTF-8 if the
+database is in version 2 format or higher.
+
+    use DBM::Deep;
+    use Digest::SHA256;
+
+    my $context = Digest::SHA256::new(256);
+
+    my $db = DBM::Deep->new(
+        filename  => "foo-sha.db",
+        digest    => \&my_digest,
+        hash_size => 32,
+    );
+
+    $db->{key1} = "value1";
+    $db->{key2} = "value2";
+    print "key1: " . $db->{key1} . "\n";
+    print "key2: " . $db->{key2} . "\n";
+
+    undef $db;
+    exit;
+
+    sub my_digest {
+        return substr( $context->hash($_[0]), 0, 32 );
+    }
+
+B<Note:> Your returned digest strings must be B<EXACTLY> the number
+of bytes you specify in the hash_size parameter (in this case 32). Undefined
+behavior will occur otherwise.
+
+B<Note:> If you do choose to use a custom digest algorithm, you must set it
+every time you access this file. Otherwise, the default (MD5) will be used.
+
+=head1 PERFORMANCE
+
+Because DBM::Deep is a concurrent datastore, every change is flushed to disk
+immediately and every read goes to disk. This means that DBM::Deep functions
+at the speed of disk (generally 10-20ms) vs. the speed of RAM (generally
+50-70ns), i.e. several orders of magnitude slower than the comparable
+in-memory data structure in Perl.
+
+There are several techniques you can use to speed up how DBM::Deep functions.
+
+=over 4
+
+=item * Put it on a ramdisk
+
+The easiest and quickest mechanism to making DBM::Deep run faster is to create
+a ramdisk and locate the DBM::Deep file there. Doing this as an option may
+become a feature of DBM::Deep, assuming there is a good ramdisk wrapper on CPAN.
+
+=item * Work at the tightest level possible
+
+It is much faster to assign the level of your db that you are working with to
+an intermediate variable than to re-look it up every time. Thus
+
+    # BAD
+    while ( my ($k, $v) = each %{$db->{foo}{bar}{baz}} ) {
+        ...
+    }
+
+    # GOOD
+    my $x = $db->{foo}{bar}{baz};
+    while ( my ($k, $v) = each %$x ) {
+        ...
+    }
+
+=item * Make your file as tight as possible
+
+If you know that you are not going to use more than 65K in your database,
+consider using the C<< pack_size => 'small' >> option. This will instruct
+DBM::Deep to use 16-bit addresses, meaning that the seek times will be less.
+ +=back + +=head1 SEE ALSO + +L, L, L, +L, L + +=cut diff --git a/src/modules/DBM/Deep/Engine.pm b/src/modules/DBM/Deep/Engine.pm new file mode 100644 index 0000000..4b9cf91 --- /dev/null +++ b/src/modules/DBM/Deep/Engine.pm @@ -0,0 +1,442 @@ +package DBM::Deep::Engine; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; +no warnings 'recursion'; + +use DBM::Deep::Iterator (); + +# File-wide notes: +# * Every method in here assumes that the storage has been appropriately +# safeguarded. This can be anything from flock() to some sort of manual +# mutex. But, it's the caller's responsibility to make sure that this has +# been done. + +sub SIG_HASH () { 'H' } +sub SIG_ARRAY () { 'A' } + +=head1 NAME + +DBM::Deep::Engine - mediate mapping between DBM::Deep objects and storage medium + +=head1 PURPOSE + +This is an internal-use-only object for L. It mediates the low-level +mapping between the L objects and the storage medium. + +The purpose of this documentation is to provide low-level documentation for +developers. It is B intended to be used by the general public. This +documentation and what it documents can and will change without notice. + +=head1 OVERVIEW + +The engine exposes an API to the DBM::Deep objects (DBM::Deep, DBM::Deep::Array, +and DBM::Deep::Hash) for their use to access the actual stored values. This API +is the following: + +=over 4 + +=item * new + +=item * read_value + +=item * get_classname + +=item * make_reference + +=item * key_exists + +=item * delete_key + +=item * write_value + +=item * get_next_key + +=item * setup + +=item * clear + +=item * begin_work + +=item * commit + +=item * rollback + +=item * lock_exclusive + +=item * lock_shared + +=item * unlock + +=back + +They are explained in their own sections below. These methods, in turn, may +provide some bounds-checking, but primarily act to instantiate objects in the +Engine::Sector::* hierarchy and dispatch to them. + +=head1 TRANSACTIONS + +Transactions in DBM::Deep are implemented using a variant of MVCC. This attempts +to keep the amount of actual work done against the file low while still providing +Atomicity, Consistency, and Isolation. Durability, unfortunately, cannot be done +with only one file. + +=head2 STALENESS + +If another process uses a transaction slot and writes stuff to it, then +terminates, the data that process wrote is still within the file. In order to +address this, there is also a transaction staleness counter associated within +every write. Each time a transaction is started, that process increments that +transaction's staleness counter. If, when it reads a value, the staleness +counters aren't identical, DBM::Deep will consider the value on disk to be stale +and discard it. + +=head2 DURABILITY + +The fourth leg of ACID is Durability, the guarantee that when a commit returns, +the data will be there the next time you read from it. This should be regardless +of any crashes or powerdowns in between the commit and subsequent read. +DBM::Deep does provide that guarantee; once the commit returns, all of the data +has been transferred from the transaction shadow to the HEAD. The issue arises +with partial commits - a commit that is interrupted in some fashion. In keeping +with DBM::Deep's "tradition" of very light error-checking and non-existent +error-handling, there is no way to recover from a partial commit. (This is +probably a failure in Consistency as well as Durability.) + +Other DBMSes use transaction logs (a separate file, generally) to achieve +Durability. 
As DBM::Deep is a single-file, we would have to do something +similar to what SQLite and BDB do in terms of committing using synchronized +writes. To do this, we would have to use a much higher RAM footprint and some +serious programming that makes my head hurt just to think about it. + +=cut + +=head1 METHODS + +=head2 read_value( $obj, $key ) + +This takes an object that provides _base_offset() and a string. It returns the +value stored in the corresponding Sector::Value's data section. + +=cut + +sub read_value { die "read_value must be implemented in a child class" } + +=head2 get_classname( $obj ) + +This takes an object that provides _base_offset() and returns the classname (if +any) associated with it. + +It delegates to Sector::Reference::get_classname() for the heavy lifting. + +It performs a staleness check. + +=cut + +sub get_classname { die "get_classname must be implemented in a child class" } + +=head2 make_reference( $obj, $old_key, $new_key ) + +This takes an object that provides _base_offset() and two strings. The +strings correspond to the old key and new key, respectively. This operation +is equivalent to (given C<< $db->{foo} = []; >>) C<< $db->{bar} = $db->{foo} >>. + +This returns nothing. + +=cut + +sub make_reference { die "make_reference must be implemented in a child class" } + +=head2 key_exists( $obj, $key ) + +This takes an object that provides _base_offset() and a string for +the key to be checked. This returns 1 for true and "" for false. + +=cut + +sub key_exists { die "key_exists must be implemented in a child class" } + +=head2 delete_key( $obj, $key ) + +This takes an object that provides _base_offset() and a string for +the key to be deleted. This returns the result of the Sector::Reference +delete_key() method. + +=cut + +sub delete_key { die "delete_key must be implemented in a child class" } + +=head2 write_value( $obj, $key, $value ) + +This takes an object that provides _base_offset(), a string for the +key, and a value. This value can be anything storable within L. + +This returns 1 upon success. + +=cut + +sub write_value { die "write_value must be implemented in a child class" } + +=head2 setup( $obj ) + +This takes an object that provides _base_offset(). It will do everything needed +in order to properly initialize all values for necessary functioning. If this is +called upon an already initialized object, this will also reset the inode. + +This returns 1. + +=cut + +sub setup { die "setup must be implemented in a child class" } + +=head2 begin_work( $obj ) + +This takes an object that provides _base_offset(). It will set up all necessary +bookkeeping in order to run all work within a transaction. + +If $obj is already within a transaction, an error will be thrown. If there are +no more available transactions, an error will be thrown. + +This returns undef. + +=cut + +sub begin_work { die "begin_work must be implemented in a child class" } + +=head2 rollback( $obj ) + +This takes an object that provides _base_offset(). It will revert all +actions taken within the running transaction. + +If $obj is not within a transaction, an error will be thrown. + +This returns 1. + +=cut + +sub rollback { die "rollback must be implemented in a child class" } + +=head2 commit( $obj ) + +This takes an object that provides _base_offset(). It will apply all +actions taken within the transaction to the HEAD. + +If $obj is not within a transaction, an error will be thrown. + +This returns 1. 
+ +=cut + +sub commit { die "commit must be implemented in a child class" } + +=head2 get_next_key( $obj, $prev_key ) + +This takes an object that provides _base_offset() and an optional string +representing the prior key returned via a prior invocation of this method. + +This method delegates to C<< DBM::Deep::Iterator->get_next_key() >>. + +=cut + +# XXX Add staleness here +sub get_next_key { + my $self = shift; + my ($obj, $prev_key) = @_; + + # XXX Need to add logic about resetting the iterator if any key in the + # reference has changed + unless ( defined $prev_key ) { + eval "use " . $self->iterator_class; die $@ if $@; + $obj->{iterator} = $self->iterator_class->new({ + base_offset => $obj->_base_offset, + engine => $self, + }); + } + + return $obj->{iterator}->get_next_key( $obj ); +} + +=head2 lock_exclusive() + +This takes an object that provides _base_offset(). It will guarantee that +the storage has taken precautions to be safe for a write. + +This returns nothing. + +=cut + +sub lock_exclusive { + my $self = shift; + my ($obj) = @_; + return $self->storage->lock_exclusive( $obj ); +} + +=head2 lock_shared() + +This takes an object that provides _base_offset(). It will guarantee that +the storage has taken precautions to be safe for a read. + +This returns nothing. + +=cut + +sub lock_shared { + my $self = shift; + my ($obj) = @_; + return $self->storage->lock_shared( $obj ); +} + +=head2 unlock() + +This takes an object that provides _base_offset(). It will guarantee that +the storage has released the most recently-taken lock. + +This returns nothing. + +=cut + +sub unlock { + my $self = shift; + my ($obj) = @_; + + my $rv = $self->storage->unlock( $obj ); + + $self->flush if $rv; + + return $rv; +} + +=head1 INTERNAL METHODS + +The following methods are internal-use-only to DBM::Deep::Engine and its +child classes. + +=cut + +=head2 flush() + +This takes no arguments. It will do everything necessary to flush all things to +disk. This is usually called during unlock() and setup(). + +This returns nothing. + +=cut + +sub flush { + my $self = shift; + + # Why do we need to have the storage flush? Shouldn't autoflush take care of + # things? -RobK, 2008-06-26 + $self->storage->flush; + + return; +} + +=head2 load_sector( $loc ) + +This takes an id/location/offset and loads the sector based on the engine's +defined sector type. + +=cut + +sub load_sector { $_[0]->sector_type->load( @_ ) } + +=head2 clear( $obj ) + +This takes an object that provides _base_offset() and deletes all its +elements, returning nothing. + +=cut + +sub clear { die "clear must be implemented in a child class" } + +=head2 cache / clear_cache + +This is the cache of loaded Reference sectors. + +=cut + +sub cache { $_[0]{cache} ||= {} } +sub clear_cache { %{$_[0]->cache} = () } + +=head2 supports( $option ) + +This returns a boolean depending on if this instance of DBM::Dep supports +that feature. C<$option> can be one of: + +=over 4 + +=item * transactions + +=item * singletons + +=back + +Any other value will return false. + +=cut + +sub supports { die "supports must be implemented in a child class" } + +=head1 ACCESSORS + +The following are readonly attributes. 
+ +=over 4 + +=item * storage + +=item * sector_type + +=item * iterator_class + +=back + +=cut + +sub storage { $_[0]{storage} } + +sub sector_type { die "sector_type must be implemented in a child class" } +sub iterator_class { die "iterator_class must be implemented in a child class" } + +# This code is to make sure we write all the values in the $value to the +# disk and to make sure all changes to $value after the assignment are +# reflected on disk. This may be counter-intuitive at first, but it is +# correct dwimmery. +# NOTE - simply tying $value won't perform a STORE on each value. Hence, +# the copy to a temp value. +sub _descend { + my $self = shift; + my ($value, $value_sector) = @_; + my $r = Scalar::Util::reftype( $value ) || ''; + + if ( $r eq 'ARRAY' ) { + my @temp = @$value; + tie @$value, 'DBM::Deep', { + base_offset => $value_sector->offset, + staleness => $value_sector->staleness, + storage => $self->storage, + engine => $self, + }; + @$value = @temp; + bless $value, 'DBM::Deep::Array' unless Scalar::Util::blessed( $value ); + } + elsif ( $r eq 'HASH' ) { + my %temp = %$value; + tie %$value, 'DBM::Deep', { + base_offset => $value_sector->offset, + staleness => $value_sector->staleness, + storage => $self->storage, + engine => $self, + }; + %$value = %temp; + bless $value, 'DBM::Deep::Hash' unless Scalar::Util::blessed( $value ); + } + + return; +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Engine/DBI.pm b/src/modules/DBM/Deep/Engine/DBI.pm new file mode 100644 index 0000000..28b7daf --- /dev/null +++ b/src/modules/DBM/Deep/Engine/DBI.pm @@ -0,0 +1,367 @@ +package DBM::Deep::Engine::DBI; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; +no warnings 'recursion'; + +use base 'DBM::Deep::Engine'; + +use DBM::Deep::Sector::DBI (); +use DBM::Deep::Storage::DBI (); + +sub sector_type { 'DBM::Deep::Sector::DBI' } +sub iterator_class { 'DBM::Deep::Iterator::DBI' } + +sub new { + my $class = shift; + my ($args) = @_; + + $args->{storage} = DBM::Deep::Storage::DBI->new( $args ) + unless exists $args->{storage}; + + my $self = bless { + storage => undef, + external_refs => undef, + }, $class; + + # Grab the parameters we want to use + foreach my $param ( keys %$self ) { + next unless exists $args->{$param}; + $self->{$param} = $args->{$param}; + } + + return $self; +} + +sub setup { + my $self = shift; + my ($obj) = @_; + + # Default the id to 1. This means that we will be creating a row if there + # isn't one. The assumption is that the row_id=1 cannot never be deleted. I + # don't know if this is a good assumption. + $obj->{base_offset} ||= 1; + + my ($rows) = $self->storage->read_from( + refs => $obj->_base_offset, + qw( ref_type ), + ); + + # We don't have a row yet. 
+ unless ( @$rows ) { + $self->storage->write_to( + refs => $obj->_base_offset, + ref_type => $obj->_type, + ); + } + + my $sector = DBM::Deep::Sector::DBI::Reference->new({ + engine => $self, + offset => $obj->_base_offset, + }); +} + +sub read_value { + my $self = shift; + my ($obj, $key) = @_; + + my $sector = $self->load_sector( $obj->_base_offset, 'refs' ) + or return; + +# if ( $sector->staleness != $obj->_staleness ) { +# return; +# } + +# my $key_md5 = $self->_apply_digest( $key ); + + my $value_sector = $sector->get_data_for({ + key => $key, +# key_md5 => $key_md5, + allow_head => 1, + }); + + unless ( $value_sector ) { + return undef + } + + return $value_sector->data; +} + +sub get_classname { + my $self = shift; + my ($obj) = @_; + + my $sector = $self->load_sector( $obj->_base_offset, 'refs' ) + or return; + + return $sector->get_classname; +} + +sub make_reference { + my $self = shift; + my ($obj, $old_key, $new_key) = @_; + + my $sector = $self->load_sector( $obj->_base_offset, 'refs' ) + or return; + +# if ( $sector->staleness != $obj->_staleness ) { +# return; +# } + + my $value_sector = $sector->get_data_for({ + key => $old_key, + allow_head => 1, + }); + + unless ( $value_sector ) { + $value_sector = DBM::Deep::Sector::DBI::Scalar->new({ + engine => $self, + data => undef, + }); + + $sector->write_data({ + key => $old_key, + value => $value_sector, + }); + } + + if ( $value_sector->isa( 'DBM::Deep::Sector::DBI::Reference' ) ) { + $sector->write_data({ + key => $new_key, + value => $value_sector, + }); + $value_sector->increment_refcount; + } + else { + $sector->write_data({ + key => $new_key, + value => $value_sector->clone, + }); + } + + return; +} + +# exists returns '', not undefined. +sub key_exists { + my $self = shift; + my ($obj, $key) = @_; + + my $sector = $self->load_sector( $obj->_base_offset, 'refs' ) + or return ''; + +# if ( $sector->staleness != $obj->_staleness ) { +# return ''; +# } + + my $data = $sector->get_data_for({ +# key_md5 => $self->_apply_digest( $key ), + key => $key, + allow_head => 1, + }); + + # exists() returns 1 or '' for true/false. + return $data ? 1 : ''; +} + +sub delete_key { + my $self = shift; + my ($obj, $key) = @_; + + my $sector = $self->load_sector( $obj->_base_offset, 'refs' ) + or return ''; + +# if ( $sector->staleness != $obj->_staleness ) { +# return ''; +# } + + return $sector->delete_key({ +# key_md5 => $self->_apply_digest( $key ), + key => $key, + allow_head => 0, + }); +} + +sub write_value { + my $self = shift; + my ($obj, $key, $value) = @_; + + my $r = Scalar::Util::reftype( $value ) || ''; + { + last if $r eq ''; + last if $r eq 'HASH'; + last if $r eq 'ARRAY'; + + DBM::Deep->_throw_error( + "Storage of references of type '$r' is not supported." + ); + } + + # Load the reference entry + # Determine if the row was deleted under us + # + + my $sector = $self->load_sector( $obj->_base_offset, 'refs' ) + or die "Cannot load sector at '@{[$obj->_base_offset]}'\n";; + + my ($type, $class); + if ( + $r eq 'ARRAY' || $r eq 'HASH' and ref $value ne 'DBM::Deep::Null' + ) { + my $tmpvar; + if ( $r eq 'ARRAY' ) { + $tmpvar = tied @$value; + } elsif ( $r eq 'HASH' ) { + $tmpvar = tied %$value; + } + + if ( $tmpvar ) { + my $is_dbm_deep = eval { local $SIG{'__DIE__'}; $tmpvar->isa( 'DBM::Deep' ); }; + + unless ( $is_dbm_deep ) { + DBM::Deep->_throw_error( "Cannot store something that is tied." 
); + } + + unless ( $tmpvar->_engine->storage == $self->storage ) { + DBM::Deep->_throw_error( "Cannot store values across DBM::Deep files. Please use export() instead." ); + } + + # Load $tmpvar's sector + + # First, verify if we're storing the same thing to this spot. If we + # are, then this should be a no-op. -EJS, 2008-05-19 + + # See whether or not we are storing ourselves to ourself. + # Write the sector as data in this reference (keyed by $key) + my $value_sector = $self->load_sector( $tmpvar->_base_offset, 'refs' ); + $sector->write_data({ + key => $key, +# key_md5 => $self->_apply_digest( $key ), + value => $value_sector, + }); + $value_sector->increment_refcount; + + return 1; + } + + $type = substr( $r, 0, 1 ); + $class = 'DBM::Deep::Sector::DBI::Reference'; + } + else { + if ( tied($value) ) { + DBM::Deep->_throw_error( "Cannot store something that is tied." ); + } + + if ( ref $value eq 'DBM::Deep::Null' ) { + DBM::Deep::_warnif( + 'uninitialized', 'Assignment of stale reference' + ); + $value = undef; + } + + $class = 'DBM::Deep::Sector::DBI::Scalar'; + $type = 'S'; + } + + # Create this after loading the reference sector in case something bad + # happens. This way, we won't allocate value sector(s) needlessly. + my $value_sector = $class->new({ + engine => $self, + data => $value, + type => $type, + }); + + $sector->write_data({ + key => $key, +# key_md5 => $self->_apply_digest( $key ), + value => $value_sector, + }); + + $self->_descend( $value, $value_sector ); + + return 1; +} + +#sub begin_work { +# my $self = shift; +# die "Transactions are not supported by this engine" +# unless $self->supports('transactions'); +# +# if ( $self->in_txn ) { +# DBM::Deep->_throw_error( "Cannot begin_work within an active transaction" ); +# } +# +# $self->storage->begin_work; +# +# $self->in_txn( 1 ); +# +# return 1; +#} +# +#sub rollback { +# my $self = shift; +# die "Transactions are not supported by this engine" +# unless $self->supports('transactions'); +# +# if ( !$self->in_txn ) { +# DBM::Deep->_throw_error( "Cannot rollback without an active transaction" ); +# } +# +# $self->storage->rollback; +# +# $self->in_txn( 0 ); +# +# return 1; +#} +# +#sub commit { +# my $self = shift; +# die "Transactions are not supported by this engine" +# unless $self->supports('transactions'); +# +# if ( !$self->in_txn ) { +# DBM::Deep->_throw_error( "Cannot commit without an active transaction" ); +# } +# +# $self->storage->commit; +# +# $self->in_txn( 0 ); +# +# return 1; +#} +# +#sub in_txn { +# my $self = shift; +# $self->{in_txn} = shift if @_; +# $self->{in_txn}; +#} + +sub supports { + my $self = shift; + my ($feature) = @_; + + return if $feature eq 'transactions'; + return 1 if $feature eq 'singletons'; + return; +} + +sub db_version { + return '1.0020' +} + +sub clear { + my $self = shift; + my $obj = shift; + + my $sector = $self->load_sector( $obj->_base_offset, 'refs' ) + or return; + + $sector->clear; + + return; +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Engine/File.pm b/src/modules/DBM/Deep/Engine/File.pm new file mode 100644 index 0000000..435e84c --- /dev/null +++ b/src/modules/DBM/Deep/Engine/File.pm @@ -0,0 +1,1191 @@ +package DBM::Deep::Engine::File; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; +no warnings 'recursion'; + +use base qw( DBM::Deep::Engine ); + +use Scalar::Util (); + +use DBM::Deep::Null (); +use DBM::Deep::Sector::File (); +use DBM::Deep::Storage::File (); + +sub sector_type { 'DBM::Deep::Sector::File' } +sub iterator_class { 
'DBM::Deep::Iterator::File' }
+
+my $STALE_SIZE = 2;
+
+# Setup file and tag signatures. These should never change.
+sub SIG_FILE    () { 'DPDB' }
+sub SIG_HEADER  () { 'h' }
+sub SIG_NULL    () { 'N' }
+sub SIG_DATA    () { 'D' }
+sub SIG_UNIDATA () { 'U' }
+sub SIG_INDEX   () { 'I' }
+sub SIG_BLIST   () { 'B' }
+sub SIG_FREE    () { 'F' }
+sub SIG_SIZE    () { 1 }
+# SIG_HASH and SIG_ARRAY are defined in DBM::Deep::Engine
+
+# Please refer to the pack() documentation for further information
+my %StP = (
+    1 => 'C', # Unsigned char value (no order needed as it's just one byte)
+    2 => 'n', # Unsigned short in "network" (big-endian) order
+    4 => 'N', # Unsigned long in "network" (big-endian) order
+    8 => 'Q', # Unsigned quad (no order specified, presumably machine-dependent)
+);
+
+=head1 NAME
+
+DBM::Deep::Engine::File - engine for use with DBM::Deep::Storage::File
+
+=head1 PURPOSE
+
+This is the engine for use with L<DBM::Deep::Storage::File>.
+
+=head1 EXTERNAL METHODS
+
+=head2 new()
+
+This takes a set of args. These args are described in the documentation for
+L<DBM::Deep/new>.
+
+=cut
+
+sub new {
+    my $class = shift;
+    my ($args) = @_;
+
+    $args->{storage} = DBM::Deep::Storage::File->new( $args )
+        unless exists $args->{storage};
+
+    my $self = bless {
+        byte_size => 4,
+
+        digest => undef,
+        hash_size => 16, # In bytes
+        hash_chars => 256, # Number of chars the algorithm uses per byte
+        max_buckets => 16,
+        num_txns => 1, # The HEAD
+        trans_id => 0, # Default to the HEAD
+
+        data_sector_size => 64, # Size in bytes of each data sector
+
+        entries => {}, # This is the list of entries for transactions
+        storage => undef,
+
+        external_refs => undef,
+    }, $class;
+
+    # Never allow byte_size to be set directly.
+    delete $args->{byte_size};
+    if ( defined $args->{pack_size} ) {
+        if ( lc $args->{pack_size} eq 'small' ) {
+            $args->{byte_size} = 2;
+        }
+        elsif ( lc $args->{pack_size} eq 'medium' ) {
+            $args->{byte_size} = 4;
+        }
+        elsif ( lc $args->{pack_size} eq 'large' ) {
+            $args->{byte_size} = 8;
+        }
+        else {
+            DBM::Deep->_throw_error( "Unknown pack_size value: '$args->{pack_size}'" );
+        }
+    }
+
+    # Grab the parameters we want to use
+    foreach my $param ( keys %$self ) {
+        next unless exists $args->{$param};
+        $self->{$param} = $args->{$param};
+    }
+
+    my %validations = (
+        max_buckets => { floor => 16, ceil => 256 },
+        num_txns => { floor => 1, ceil => 255 },
+        data_sector_size => { floor => 32, ceil => 256 },
+    );
+
+    while ( my ($attr, $c) = each %validations ) {
+        if ( !defined $self->{$attr}
+          || !length $self->{$attr}
+          || $self->{$attr} =~ /\D/
+          || $self->{$attr} < $c->{floor}
+        ) {
+            $self->{$attr} = '(undef)' if !defined $self->{$attr};
+            warn "Floor of $attr is $c->{floor}. Setting it to $c->{floor} from '$self->{$attr}'\n";
+            $self->{$attr} = $c->{floor};
+        }
+        elsif ( $self->{$attr} > $c->{ceil} ) {
+            warn "Ceiling of $attr is $c->{ceil}.
Setting it to $c->{ceil} from '$self->{$attr}'\n"; + $self->{$attr} = $c->{ceil}; + } + } + + if ( !$self->{digest} ) { + require Digest::MD5; + $self->{digest} = \&Digest::MD5::md5; + } + + return $self; +} + +sub read_value { + my $self = shift; + my ($obj, $key) = @_; + + # This will be a Reference sector + my $sector = $self->load_sector( $obj->_base_offset ) + or return; + + if ( $sector->staleness != $obj->_staleness ) { + return; + } + + my $key_md5 = $self->_apply_digest( $key ); + + my $value_sector = $sector->get_data_for({ + key_md5 => $key_md5, + allow_head => 1, + }); + + unless ( $value_sector ) { + return undef + } + + return $value_sector->data; +} + +sub get_classname { + my $self = shift; + my ($obj) = @_; + + # This will be a Reference sector + my $sector = $self->load_sector( $obj->_base_offset ) + or DBM::Deep->_throw_error( "How did get_classname fail (no sector for '$obj')?!" ); + + if ( $sector->staleness != $obj->_staleness ) { + return; + } + + return $sector->get_classname; +} + +sub make_reference { + my $self = shift; + my ($obj, $old_key, $new_key) = @_; + + # This will be a Reference sector + my $sector = $self->load_sector( $obj->_base_offset ) + or DBM::Deep->_throw_error( "How did make_reference fail (no sector for '$obj')?!" ); + + if ( $sector->staleness != $obj->_staleness ) { + return; + } + + my $old_md5 = $self->_apply_digest( $old_key ); + + my $value_sector = $sector->get_data_for({ + key_md5 => $old_md5, + allow_head => 1, + }); + + unless ( $value_sector ) { + $value_sector = DBM::Deep::Sector::File::Null->new({ + engine => $self, + data => undef, + }); + + $sector->write_data({ + key_md5 => $old_md5, + key => $old_key, + value => $value_sector, + }); + } + + if ( $value_sector->isa( 'DBM::Deep::Sector::File::Reference' ) ) { + $sector->write_data({ + key => $new_key, + key_md5 => $self->_apply_digest( $new_key ), + value => $value_sector, + }); + $value_sector->increment_refcount; + } + else { + $sector->write_data({ + key => $new_key, + key_md5 => $self->_apply_digest( $new_key ), + value => $value_sector->clone, + }); + } + + return; +} + +# exists returns '', not undefined. +sub key_exists { + my $self = shift; + my ($obj, $key) = @_; + + # This will be a Reference sector + my $sector = $self->load_sector( $obj->_base_offset ) + or return ''; + + if ( $sector->staleness != $obj->_staleness ) { + return ''; + } + + my $data = $sector->get_data_for({ + key_md5 => $self->_apply_digest( $key ), + allow_head => 1, + }); + + # exists() returns 1 or '' for true/false. + return $data ? 1 : ''; +} + +sub delete_key { + my $self = shift; + my ($obj, $key) = @_; + + my $sector = $self->load_sector( $obj->_base_offset ) + or return; + + if ( $sector->staleness != $obj->_staleness ) { + return; + } + + return $sector->delete_key({ + key_md5 => $self->_apply_digest( $key ), + allow_head => 0, + }); +} + +sub write_value { + my $self = shift; + my ($obj, $key, $value) = @_; + + my $r = Scalar::Util::reftype( $value ) || ''; + { + last if $r eq ''; + last if $r eq 'HASH'; + last if $r eq 'ARRAY'; + + DBM::Deep->_throw_error( + "Storage of references of type '$r' is not supported." + ); + } + + # This will be a Reference sector + my $sector = $self->load_sector( $obj->_base_offset ) + or DBM::Deep->_throw_error( "Cannot write to a deleted spot in DBM::Deep." ); + + if ( $sector->staleness != $obj->_staleness ) { + DBM::Deep->_throw_error( "Cannot write to a deleted spot in DBM::Deep." 
); + } + + my ($class, $type); + if ( !defined $value ) { + $class = 'DBM::Deep::Sector::File::Null'; + } + elsif ( ref $value eq 'DBM::Deep::Null' ) { + DBM::Deep::_warnif( + 'uninitialized', 'Assignment of stale reference' + ); + $class = 'DBM::Deep::Sector::File::Null'; + $value = undef; + } + elsif ( $r eq 'ARRAY' || $r eq 'HASH' ) { + my $tmpvar; + if ( $r eq 'ARRAY' ) { + $tmpvar = tied @$value; + } elsif ( $r eq 'HASH' ) { + $tmpvar = tied %$value; + } + + if ( $tmpvar ) { + my $is_dbm_deep = eval { local $SIG{'__DIE__'}; $tmpvar->isa( 'DBM::Deep' ); }; + + unless ( $is_dbm_deep ) { + DBM::Deep->_throw_error( "Cannot store something that is tied." ); + } + + unless ( $tmpvar->_engine->storage == $self->storage ) { + DBM::Deep->_throw_error( "Cannot store values across DBM::Deep files. Please use export() instead." ); + } + + # First, verify if we're storing the same thing to this spot. If we + # are, then this should be a no-op. -EJS, 2008-05-19 + my $loc = $sector->get_data_location_for({ + key_md5 => $self->_apply_digest( $key ), + allow_head => 1, + }); + + if ( defined($loc) && $loc == $tmpvar->_base_offset ) { + return 1; + } + + #XXX Can this use $loc? + my $value_sector = $self->load_sector( $tmpvar->_base_offset ); + $sector->write_data({ + key => $key, + key_md5 => $self->_apply_digest( $key ), + value => $value_sector, + }); + $value_sector->increment_refcount; + + return 1; + } + + $class = 'DBM::Deep::Sector::File::Reference'; + $type = substr( $r, 0, 1 ); + } + else { + if ( tied($value) ) { + DBM::Deep->_throw_error( "Cannot store something that is tied." ); + } + $class = 'DBM::Deep::Sector::File::Scalar'; + } + + # Create this after loading the reference sector in case something bad + # happens. This way, we won't allocate value sector(s) needlessly. + my $value_sector = $class->new({ + engine => $self, + data => $value, + type => $type, + }); + + $sector->write_data({ + key => $key, + key_md5 => $self->_apply_digest( $key ), + value => $value_sector, + }); + + $self->_descend( $value, $value_sector ); + + return 1; +} + +sub setup { + my $self = shift; + my ($obj) = @_; + + # We're opening the file. + unless ( $obj->_base_offset ) { + my $bytes_read = $self->_read_file_header; + + # Creating a new file + unless ( $bytes_read ) { + $self->_write_file_header; + + # 1) Create Array/Hash entry + my $initial_reference = DBM::Deep::Sector::File::Reference->new({ + engine => $self, + type => $obj->_type, + }); + $obj->{base_offset} = $initial_reference->offset; + $obj->{staleness} = $initial_reference->staleness; + + $self->storage->flush; + } + # Reading from an existing file + else { + $obj->{base_offset} = $bytes_read; + my $initial_reference = DBM::Deep::Sector::File::Reference->new({ + engine => $self, + offset => $obj->_base_offset, + }); + unless ( $initial_reference ) { + DBM::Deep->_throw_error("Corrupted file, no master index record"); + } + + unless ($obj->_type eq $initial_reference->type) { + DBM::Deep->_throw_error("File type mismatch"); + } + + $obj->{staleness} = $initial_reference->staleness; + } + } + + $self->storage->set_inode; + + return 1; +} + +sub begin_work { + my $self = shift; + my ($obj) = @_; + + unless ($self->supports('transactions')) { + DBM::Deep->_throw_error( "Cannot begin_work unless transactions are supported" ); + } + + if ( $self->trans_id ) { + DBM::Deep->_throw_error( "Cannot begin_work within an active transaction" ); + } + + my @slots = $self->read_txn_slots; + my $found; + for my $i ( 0 .. 
$self->num_txns-2 ) { + next if $slots[$i]; + + $slots[$i] = 1; + $self->set_trans_id( $i + 1 ); + $found = 1; + last; + } + unless ( $found ) { + DBM::Deep->_throw_error( "Cannot allocate transaction ID" ); + } + $self->write_txn_slots( @slots ); + + if ( !$self->trans_id ) { + DBM::Deep->_throw_error( "Cannot begin_work - no available transactions" ); + } + + return; +} + +sub rollback { + my $self = shift; + my ($obj) = @_; + + unless ($self->supports('transactions')) { + DBM::Deep->_throw_error( "Cannot rollback unless transactions are supported" ); + } + + if ( !$self->trans_id ) { + DBM::Deep->_throw_error( "Cannot rollback without an active transaction" ); + } + + # Each entry is the file location for a bucket that has a modification for + # this transaction. The entries need to be expunged. + foreach my $entry (@{ $self->get_entries } ) { + # Remove the entry here + my $read_loc = $entry + + $self->hash_size + + $self->byte_size + + $self->byte_size + + ($self->trans_id - 1) * ( $self->byte_size + $STALE_SIZE ); + + my $data_loc = $self->storage->read_at( $read_loc, $self->byte_size ); + $data_loc = unpack( $StP{$self->byte_size}, $data_loc ); + $self->storage->print_at( $read_loc, pack( $StP{$self->byte_size}, 0 ) ); + + if ( $data_loc > 1 ) { + $self->load_sector( $data_loc )->free; + } + } + + $self->clear_entries; + + my @slots = $self->read_txn_slots; + $slots[$self->trans_id-1] = 0; + $self->write_txn_slots( @slots ); + $self->inc_txn_staleness_counter( $self->trans_id ); + $self->set_trans_id( 0 ); + + return 1; +} + +sub commit { + my $self = shift; + my ($obj) = @_; + + unless ($self->supports('transactions')) { + DBM::Deep->_throw_error( "Cannot commit unless transactions are supported" ); + } + + if ( !$self->trans_id ) { + DBM::Deep->_throw_error( "Cannot commit without an active transaction" ); + } + + foreach my $entry (@{ $self->get_entries } ) { + # Overwrite the entry in head with the entry in trans_id + my $base = $entry + + $self->hash_size + + $self->byte_size; + + my $head_loc = $self->storage->read_at( $base, $self->byte_size ); + $head_loc = unpack( $StP{$self->byte_size}, $head_loc ); + + my $spot = $base + $self->byte_size + ($self->trans_id - 1) * ( $self->byte_size + $STALE_SIZE ); + my $trans_loc = $self->storage->read_at( + $spot, $self->byte_size, + ); + + $self->storage->print_at( $base, $trans_loc ); + $self->storage->print_at( + $spot, + pack( $StP{$self->byte_size} . ' ' . $StP{$STALE_SIZE}, (0) x 2 ), + ); + + if ( $head_loc > 1 ) { + $self->load_sector( $head_loc )->free; + } + } + + $self->clear_entries; + + my @slots = $self->read_txn_slots; + $slots[$self->trans_id-1] = 0; + $self->write_txn_slots( @slots ); + $self->inc_txn_staleness_counter( $self->trans_id ); + $self->set_trans_id( 0 ); + + return 1; +} + +=head1 INTERNAL METHODS + +The following methods are internal-use-only to DBM::Deep::Engine::File. + +=cut + +=head2 read_txn_slots() + +This takes no arguments. + +This will return an array with a 1 or 0 in each slot. Each spot represents one +available transaction. If the slot is 1, that transaction is taken. If it is 0, +the transaction is available. + +=cut + +sub read_txn_slots { + my $self = shift; + my $bl = $self->txn_bitfield_len; + my $num_bits = $bl * 8; + return split '', unpack( 'b'.$num_bits, + $self->storage->read_at( + $self->trans_loc, $bl, + ) + ); +} + +=head2 write_txn_slots( @slots ) + +This takes an array of 1's and 0's. This array represents the transaction slots +returned by L. 
In other words, the following is true:
+
+    @x = read_txn_slots( write_txn_slots( @x ) );
+
+(With the obviously missing object referents added back in.)
+
+=cut
+
+sub write_txn_slots {
+    my $self = shift;
+    my $num_bits = $self->txn_bitfield_len * 8;
+    $self->storage->print_at( $self->trans_loc,
+        pack( 'b'.$num_bits, join('', @_) ),
+    );
+}
+
+=head2 get_running_txn_ids()
+
+This takes no arguments.
+
+This will return an array of taken transaction IDs. This wraps L</read_txn_slots>.
+
+=cut
+
+sub get_running_txn_ids {
+    my $self = shift;
+    my @transactions = $self->read_txn_slots;
+    my @trans_ids = map { $_+1 } grep { $transactions[$_] } 0 .. $#transactions;
+}
+
+=head2 get_txn_staleness_counter( $trans_id )
+
+This will return the staleness counter for the given transaction ID. Please see
+L<DBM::Deep::Engine/STALENESS> for more information.
+
+=cut
+
+sub get_txn_staleness_counter {
+    my $self = shift;
+    my ($trans_id) = @_;
+
+    # Hardcode staleness of 0 for the HEAD
+    return 0 unless $trans_id;
+
+    return unpack( $StP{$STALE_SIZE},
+        $self->storage->read_at(
+            $self->trans_loc + $self->txn_bitfield_len + $STALE_SIZE * ($trans_id - 1),
+            $STALE_SIZE,
+        )
+    );
+}
+
+=head2 inc_txn_staleness_counter( $trans_id )
+
+This will increment the staleness counter for the given transaction ID. Please see
+L<DBM::Deep::Engine/STALENESS> for more information.
+
+=cut
+
+sub inc_txn_staleness_counter {
+    my $self = shift;
+    my ($trans_id) = @_;
+
+    # Hardcode staleness of 0 for the HEAD
+    return 0 unless $trans_id;
+
+    $self->storage->print_at(
+        $self->trans_loc + $self->txn_bitfield_len + $STALE_SIZE * ($trans_id - 1),
+        pack( $StP{$STALE_SIZE}, $self->get_txn_staleness_counter( $trans_id ) + 1 ),
+    );
+}
+
+=head2 get_entries()
+
+This takes no arguments.
+
+This returns a list of all the sectors that have been modified by this transaction.
+
+=cut
+
+sub get_entries {
+    my $self = shift;
+    return [ keys %{ $self->{entries}{$self->trans_id} ||= {} } ];
+}
+
+=head2 add_entry( $trans_id, $location )
+
+This takes a transaction ID and a file location and marks the sector at that
+location as having been modified by the transaction identified by $trans_id.
+
+This returns nothing.
+
+B<NOTE>: Unlike all the other _entries() methods, there are several cases where
+C<< $trans_id != $self->trans_id >> for this method.
+
+=cut
+
+sub add_entry {
+    my $self = shift;
+    my ($trans_id, $loc) = @_;
+
+    $self->{entries}{$trans_id} ||= {};
+    $self->{entries}{$trans_id}{$loc} = undef;
+}
+
+=head2 reindex_entry( $old_loc, $new_loc )
+
+This takes two locations (old and new, respectively). If a location that has
+been modified by this transaction is subsequently reindexed due to a bucketlist
+overflowing, then the entries hash needs to be made aware of this change.
+
+This returns nothing.
+
+=cut
+
+sub reindex_entry {
+    my $self = shift;
+    my ($old_loc, $new_loc) = @_;
+
+    TRANS:
+    while ( my ($trans_id, $locs) = each %{ $self->{entries} } ) {
+        if ( exists $locs->{$old_loc} ) {
+            delete $locs->{$old_loc};
+            $locs->{$new_loc} = undef;
+            next TRANS;
+        }
+    }
+}
+
+=head2 clear_entries()
+
+This takes no arguments. It will clear the entries list for the running
+transaction.
+
+This returns nothing.
+
+=cut
+
+sub clear_entries {
+    my $self = shift;
+    delete $self->{entries}{$self->trans_id};
+}
+
+=head2 _write_file_header()
+
+This writes the file header for a new file. This will write the various settings
+that set how the file is interpreted.
+
+=head2 _read_file_header()
+
+This reads the file header from an existing file.
This will read the various +settings that set how the file is interpreted. + +=cut + +{ + my $header_fixed = length( __PACKAGE__->SIG_FILE ) + 1 + 4 + 4; + my $this_file_version = 4; + my $min_file_version = 3; + + sub _write_file_header { + my $self = shift; + + my $nt = $self->num_txns; + my $bl = $self->txn_bitfield_len; + + my $header_var = 1 + 1 + 1 + 1 + $bl + $STALE_SIZE * ($nt - 1) + 3 * $self->byte_size; + + my $loc = $self->storage->request_space( $header_fixed + $header_var ); + + $self->storage->print_at( $loc, + $self->SIG_FILE, + $self->SIG_HEADER, + pack('N', $this_file_version), # At this point, we're at 9 bytes + pack('N', $header_var), # header size + # --- Above is $header_fixed. Below is $header_var + pack('C', $self->byte_size), + + # These shenanigans are to allow a 256 within a C + pack('C', $self->max_buckets - 1), + pack('C', $self->data_sector_size - 1), + + pack('C', $nt), + pack('C' . $bl, 0 ), # Transaction activeness bitfield + pack($StP{$STALE_SIZE}.($nt-1), 0 x ($nt-1) ), # Transaction staleness counters + pack($StP{$self->byte_size}, 0), # Start of free chain (blist size) + pack($StP{$self->byte_size}, 0), # Start of free chain (data size) + pack($StP{$self->byte_size}, 0), # Start of free chain (index size) + ); + + #XXX Set these less fragilely + $self->set_trans_loc( $header_fixed + 4 ); + $self->set_chains_loc( $header_fixed + 4 + $bl + $STALE_SIZE * ($nt-1) ); + + $self->{v} = $this_file_version; + + return; + } + + sub _read_file_header { + my $self = shift; + + my $buffer = $self->storage->read_at( 0, $header_fixed ); + return unless length($buffer); + + my ($file_signature, $sig_header, $file_version, $size) = unpack( + 'A4 A N N', $buffer + ); + + unless ( $file_signature eq $self->SIG_FILE ) { + $self->storage->close; + DBM::Deep->_throw_error( "Signature not found -- file is not a Deep DB" ); + } + + unless ( $sig_header eq $self->SIG_HEADER ) { + $self->storage->close; + DBM::Deep->_throw_error( "Pre-1.00 file version found" ); + } + + if ( $file_version < $min_file_version ) { + $self->storage->close; + DBM::Deep->_throw_error( + "This file version is too old - " + . _guess_version($file_version) . + " - expected " . _guess_version($min_file_version) + . " to " . _guess_version($this_file_version) + ); + } + if ( $file_version > $this_file_version ) { + $self->storage->close; + DBM::Deep->_throw_error( + "This file version is too new - probably " + . _guess_version($file_version) . + " - expected " . _guess_version($min_file_version) + . " to " . _guess_version($this_file_version) + ); + } + $self->{v} = $file_version; + + my $buffer2 = $self->storage->read_at( undef, $size ); + my @values = unpack( 'C C C C', $buffer2 ); + + if ( @values != 4 || grep { !defined } @values ) { + $self->storage->close; + DBM::Deep->_throw_error("Corrupted file - bad header"); + } + + if ($values[3] != $self->{num_txns}) { + warn "num_txns ($self->{num_txns}) is different from the file ($values[3])\n"; + } + + #XXX Add warnings if values weren't set right + @{$self}{qw(byte_size max_buckets data_sector_size num_txns)} = @values; + + # These shenanigans are to allow a 256 within a C + $self->{max_buckets} += 1; + $self->{data_sector_size} += 1; + + my $bl = $self->txn_bitfield_len; + + my $header_var = scalar(@values) + $bl + $STALE_SIZE * ($self->num_txns - 1) + 3 * $self->byte_size; + unless ( $size == $header_var ) { + $self->storage->close; + DBM::Deep->_throw_error( "Unexpected size found ($size <-> $header_var)." 
); + } + + $self->set_trans_loc( $header_fixed + scalar(@values) ); + $self->set_chains_loc( $header_fixed + scalar(@values) + $bl + $STALE_SIZE * ($self->num_txns - 1) ); + + return length($buffer) + length($buffer2); + } + + sub _guess_version { + $_[0] == 4 and return 2; + $_[0] == 3 and return '1.0003'; + $_[0] == 2 and return '1.00'; + $_[0] == 1 and return '0.99'; + $_[0] == 0 and return '0.91'; + + return $_[0]-2; + } +} + +=head2 _apply_digest( @stuff ) + +This will apply the digest method (default to Digest::MD5::md5) to the arguments +passed in and return the result. + +=cut + +sub _apply_digest { + my $self = shift; + my $victim = shift; + utf8::encode $victim if $self->{v} >= 4; + return $self->{digest}->($victim); +} + +=head2 _add_free_blist_sector( $offset, $size ) + +=head2 _add_free_data_sector( $offset, $size ) + +=head2 _add_free_index_sector( $offset, $size ) + +These methods are all wrappers around _add_free_sector(), providing the proper +chain offset ($multiple) for the sector type. + +=cut + +sub _add_free_blist_sector { shift->_add_free_sector( 0, @_ ) } +sub _add_free_data_sector { shift->_add_free_sector( 1, @_ ) } +sub _add_free_index_sector { shift->_add_free_sector( 2, @_ ) } + +=head2 _add_free_sector( $multiple, $offset, $size ) + +_add_free_sector() takes the offset into the chains location, the offset of the +sector, and the size of that sector. It will mark the sector as a free sector +and put it into the list of sectors that are free of this type for use later. + +This returns nothing. + +B: $size is unused? + +=cut + +sub _add_free_sector { + my $self = shift; + my ($multiple, $offset, $size) = @_; + + my $chains_offset = $multiple * $self->byte_size; + + my $storage = $self->storage; + + # Increment staleness. + # XXX Can this increment+modulo be done by "&= 0x1" ? + my $staleness = unpack( $StP{$STALE_SIZE}, $storage->read_at( $offset + $self->SIG_SIZE, $STALE_SIZE ) ); + $staleness = ($staleness + 1 ) % ( 2 ** ( 8 * $STALE_SIZE ) ); + $storage->print_at( $offset + $self->SIG_SIZE, pack( $StP{$STALE_SIZE}, $staleness ) ); + + my $old_head = $storage->read_at( $self->chains_loc + $chains_offset, $self->byte_size ); + + $storage->print_at( $self->chains_loc + $chains_offset, + pack( $StP{$self->byte_size}, $offset ), + ); + + # Record the old head in the new sector after the signature and staleness counter + $storage->print_at( $offset + $self->SIG_SIZE + $STALE_SIZE, $old_head ); +} + +=head2 _request_blist_sector( $size ) + +=head2 _request_data_sector( $size ) + +=head2 _request_index_sector( $size ) + +These methods are all wrappers around _request_sector(), providing the proper +chain offset ($multiple) for the sector type. + +=cut + +sub _request_blist_sector { shift->_request_sector( 0, @_ ) } +sub _request_data_sector { shift->_request_sector( 1, @_ ) } +sub _request_index_sector { shift->_request_sector( 2, @_ ) } + +=head2 _request_sector( $multiple $size ) + +This takes the offset into the chains location and the size of that sector. + +This returns the object with the sector. If there is an available free sector of +that type, then it will be reused. If there isn't one, then a new one will be +allocated. 
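+
+For illustration only (the offsets are made up, and these are internal
+methods, not public API), the chain behaves like a LIFO free list: freeing a
+sector pushes it onto the head of its chain, and the next request of that
+type pops it back off:
+
+    $engine->_add_free_data_sector( 4096, 64 );     # 4096 becomes the chain head
+    my $loc = $engine->_request_data_sector( 64 );  # pops the head: $loc == 4096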
+ +=cut + +sub _request_sector { + my $self = shift; + my ($multiple, $size) = @_; + + my $chains_offset = $multiple * $self->byte_size; + + my $old_head = $self->storage->read_at( $self->chains_loc + $chains_offset, $self->byte_size ); + my $loc = unpack( $StP{$self->byte_size}, $old_head ); + + # We don't have any free sectors of the right size, so allocate a new one. + unless ( $loc ) { + my $offset = $self->storage->request_space( $size ); + + # Zero out the new sector. This also guarantees correct increases + # in the filesize. + $self->storage->print_at( $offset, chr(0) x $size ); + + return $offset; + } + + # Read the new head after the signature and the staleness counter + my $new_head = $self->storage->read_at( $loc + $self->SIG_SIZE + $STALE_SIZE, $self->byte_size ); + $self->storage->print_at( $self->chains_loc + $chains_offset, $new_head ); + $self->storage->print_at( + $loc + $self->SIG_SIZE + $STALE_SIZE, + pack( $StP{$self->byte_size}, 0 ), + ); + + return $loc; +} + +=head2 ACCESSORS + +The following are readonly attributes. + +=over 4 + +=item * byte_size + +=item * hash_size + +=item * hash_chars + +=item * num_txns + +=item * max_buckets + +=item * blank_md5 + +=item * data_sector_size + +=item * txn_bitfield_len + +=back + +=cut + +sub byte_size { $_[0]{byte_size} } +sub hash_size { $_[0]{hash_size} } +sub hash_chars { $_[0]{hash_chars} } +sub num_txns { $_[0]{num_txns} } +sub max_buckets { $_[0]{max_buckets} } +sub blank_md5 { chr(0) x $_[0]->hash_size } +sub data_sector_size { $_[0]{data_sector_size} } + +# This is a calculated value +sub txn_bitfield_len { + my $self = shift; + unless ( exists $self->{txn_bitfield_len} ) { + my $temp = ($self->num_txns) / 8; + if ( $temp > int( $temp ) ) { + $temp = int( $temp ) + 1; + } + $self->{txn_bitfield_len} = $temp; + } + return $self->{txn_bitfield_len}; +} + +=pod + +The following are read/write attributes. + +=over 4 + +=item * trans_id / set_trans_id( $new_id ) + +=item * trans_loc / set_trans_loc( $new_loc ) + +=item * chains_loc / set_chains_loc( $new_loc ) + +=back + +=cut + +sub trans_id { $_[0]{trans_id} } +sub set_trans_id { $_[0]{trans_id} = $_[1] } + +sub trans_loc { $_[0]{trans_loc} } +sub set_trans_loc { $_[0]{trans_loc} = $_[1] } + +sub chains_loc { $_[0]{chains_loc} } +sub set_chains_loc { $_[0]{chains_loc} = $_[1] } + +sub supports { + my $self = shift; + my ($feature) = @_; + + if ( $feature eq 'transactions' ) { + return $self->num_txns > 1; + } + return 1 if $feature eq 'singletons'; + return 1 if $feature eq 'unicode'; + return; +} + +sub db_version { + return $_[0]{v} == 3 ? '1.0003' : 2; +} + +sub clear { + my $self = shift; + my $obj = shift; + + my $sector = $self->load_sector( $obj->_base_offset ) + or return; + + return unless $sector->staleness == $obj->_staleness; + + $sector->clear; + + return; +} + +=head2 _dump_file() + +This method takes no arguments. It's used to print out a textual representation +of the DBM::Deep DB file. It assumes the file is not-corrupted. + +=cut + +sub _dump_file { + my $self = shift; + + # Read the header + my $spot = $self->_read_file_header(); + + my %types = ( + 0 => 'B', + 1 => 'D', + 2 => 'I', + ); + + my %sizes = ( + 'D' => $self->data_sector_size, + 'B' => DBM::Deep::Sector::File::BucketList->new({engine=>$self,offset=>1})->size, + 'I' => DBM::Deep::Sector::File::Index->new({engine=>$self,offset=>1})->size, + ); + + my $return = ""; + + # Header values + $return .= "NumTxns: " . $self->num_txns . 
$/; + + # Read the free sector chains + my %sectors; + foreach my $multiple ( 0 .. 2 ) { + $return .= "Chains($types{$multiple}):"; + my $old_loc = $self->chains_loc + $multiple * $self->byte_size; + while ( 1 ) { + my $loc = unpack( + $StP{$self->byte_size}, + $self->storage->read_at( $old_loc, $self->byte_size ), + ); + + # We're now out of free sectors of this kind. + unless ( $loc ) { + last; + } + + $sectors{ $types{$multiple} }{ $loc } = undef; + $old_loc = $loc + $self->SIG_SIZE + $STALE_SIZE; + $return .= " $loc"; + } + $return .= $/; + } + + SECTOR: + while ( $spot < $self->storage->{end} ) { + # Read each sector in order. + my $sector = $self->load_sector( $spot ); + if ( !$sector ) { + # Find it in the free-sectors that were found already + foreach my $type ( keys %sectors ) { + if ( exists $sectors{$type}{$spot} ) { + my $size = $sizes{$type}; + $return .= sprintf "%08d: %s %04d\n", $spot, 'F' . $type, $size; + $spot += $size; + next SECTOR; + } + } + + die "********\n$return\nDidn't find free sector for $spot in chains\n********\n"; + } + else { + $return .= sprintf "%08d: %s %04d", $spot, $sector->type, $sector->size; + if ( $sector->type =~ /^[DU]\z/ ) { + $return .= ' ' . $sector->data; + } + elsif ( $sector->type eq 'A' || $sector->type eq 'H' ) { + $return .= ' REF: ' . $sector->get_refcount; + } + elsif ( $sector->type eq 'B' ) { + foreach my $bucket ( $sector->chopped_up ) { + $return .= "\n "; + $return .= sprintf "%08d", unpack($StP{$self->byte_size}, + substr( $bucket->[-1], $self->hash_size, $self->byte_size), + ); + my $l = unpack( $StP{$self->byte_size}, + substr( $bucket->[-1], + $self->hash_size + $self->byte_size, + $self->byte_size, + ), + ); + $return .= sprintf " %08d", $l; + foreach my $txn ( 0 .. $self->num_txns - 2 ) { + my $l = unpack( $StP{$self->byte_size}, + substr( $bucket->[-1], + $self->hash_size + 2 * $self->byte_size + $txn * ($self->byte_size + $STALE_SIZE), + $self->byte_size, + ), + ); + $return .= sprintf " %08d", $l; + } + } + } + $return .= $/; + + $spot += $sector->size; + } + } + + return $return; +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Hash.pm b/src/modules/DBM/Deep/Hash.pm new file mode 100644 index 0000000..39d1503 --- /dev/null +++ b/src/modules/DBM/Deep/Hash.pm @@ -0,0 +1,136 @@ +package DBM::Deep::Hash; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; +no warnings 'recursion'; + +use base 'DBM::Deep'; + +sub _get_self { + # See the note in Array.pm as to why this is commented out. + # eval { local $SIG{'__DIE__'}; tied( %{$_[0]} ) } || $_[0] + + # During global destruction %{$_[0]} might get tied to undef, so we + # need to check that case if tied returns false. + tied %{$_[0]} or local *@, eval { exists $_[0]{_}; 1 } ? $_[0] : undef +} + +sub _repr { return {} } + +sub TIEHASH { + my $class = shift; + my $args = $class->_get_args( @_ ); + + $args->{type} = $class->TYPE_HASH; + + return $class->_init($args); +} + +sub FETCH { + my $self = shift->_get_self; + DBM::Deep->_throw_error( "Cannot use an undefined hash key." ) unless defined $_[0]; + my $key = ($self->_engine->storage->{filter_store_key}) + ? $self->_engine->storage->{filter_store_key}->($_[0]) + : $_[0]; + + return $self->SUPER::FETCH( $key, $_[0] ); +} + +sub STORE { + my $self = shift->_get_self; + DBM::Deep->_throw_error( "Cannot use an undefined hash key." ) unless defined $_[0]; + my $key = ($self->_engine->storage->{filter_store_key}) + ? 
$self->_engine->storage->{filter_store_key}->($_[0])
+      : $_[0];
+    my $value = $_[1];
+
+    return $self->SUPER::STORE( $key, $value, $_[0] );
+}
+
+sub EXISTS {
+    my $self = shift->_get_self;
+    DBM::Deep->_throw_error( "Cannot use an undefined hash key." ) unless defined $_[0];
+    my $key = ($self->_engine->storage->{filter_store_key})
+      ? $self->_engine->storage->{filter_store_key}->($_[0])
+      : $_[0];
+
+    return $self->SUPER::EXISTS( $key );
+}
+
+sub DELETE {
+    my $self = shift->_get_self;
+    DBM::Deep->_throw_error( "Cannot use an undefined hash key." ) unless defined $_[0];
+    my $key = ($self->_engine->storage->{filter_store_key})
+      ? $self->_engine->storage->{filter_store_key}->($_[0])
+      : $_[0];
+
+    return $self->SUPER::DELETE( $key, $_[0] );
+}
+
+# Locate and return first key (in no particular order)
+sub FIRSTKEY {
+    my $self = shift->_get_self;
+
+    $self->lock_shared;
+
+    my $result = $self->_engine->get_next_key( $self );
+
+    $self->unlock;
+
+    return ($result && $self->_engine->storage->{filter_fetch_key})
+        ? $self->_engine->storage->{filter_fetch_key}->($result)
+        : $result;
+}
+
+# Return next key (in no particular order), given previous one
+sub NEXTKEY {
+    my $self = shift->_get_self;
+
+    my $prev_key = ($self->_engine->storage->{filter_store_key})
+        ? $self->_engine->storage->{filter_store_key}->($_[0])
+        : $_[0];
+
+    $self->lock_shared;
+
+    my $result = $self->_engine->get_next_key( $self, $prev_key );
+
+    $self->unlock;
+
+    return ($result && $self->_engine->storage->{filter_fetch_key})
+        ? $self->_engine->storage->{filter_fetch_key}->($result)
+        : $result;
+}
+
+sub first_key { (shift)->FIRSTKEY(@_) }
+sub next_key { (shift)->NEXTKEY(@_) }
+
+sub _clear {
+    my $self = shift;
+
+    while ( defined(my $key = $self->first_key) ) {
+        do {
+            $self->_engine->delete_key( $self, $key, $key );
+        } while defined($key = $self->next_key($key));
+    }
+
+    return;
+}
+
+sub _copy_node {
+    my $self = shift;
+    my ($db_temp) = @_;
+
+    my $key = $self->first_key();
+    while (defined $key) {
+        my $value = $self->get($key);
+        $self->_copy_value( \$db_temp->{$key}, $value );
+        $key = $self->next_key($key);
+    }
+
+    return 1;
+}
+
+1;
+__END__
diff --git a/src/modules/DBM/Deep/Internals.pod b/src/modules/DBM/Deep/Internals.pod
new file mode 100644
index 0000000..510aa31
--- /dev/null
+++ b/src/modules/DBM/Deep/Internals.pod
@@ -0,0 +1,347 @@
+=head1 NAME
+
+DBM::Deep::Internals - Out of date documentation on DBM::Deep internals
+
+=head1 OUT OF DATE
+
+This document is out-of-date. It describes an intermediate file format used
+during the development from 0.983 to 1.0000. It will be rewritten soon.
+
+So far, the description of the header format has been updated.
+
+=head1 DESCRIPTION
+
+This is a document describing the internal workings of L<DBM::Deep>. It is
+not necessary to read this document if you only intend to be a user. This
+document is intended for people who either want a deeper understanding of
+specifics of how L<DBM::Deep> works or who wish to help program
+L<DBM::Deep>.
+
+=head1 CLASS LAYOUT
+
+L<DBM::Deep> is broken up into five classes in three inheritance hierarchies.
+
+=over 4
+
+=item *
+
+L<DBM::Deep> is the parent of L<DBM::Deep::Array> and L<DBM::Deep::Hash>.
+These classes form the immediate interface to the outside world. They are the
+classes that provide the TIE mechanisms as well as the OO methods.
+
+=item *
+
+L<DBM::Deep::Engine> is the layer that deals with the mechanics of reading
+and writing to the file. This is where the logic of the file layout is
+handled.
+
+=item *
+
+L<DBM::Deep::Storage> is the layer that deals with the physical file. As a
+singleton that every other object has a reference to, it also provides a place
+to handle datastructure-wide items, such as transactions.
+
+=back
+
+=head1 FILE LAYOUT
+
+This describes the 1.0003 and 2.0000 formats, which internally are numbered
+3 and 4, respectively. The internal numbers are used in this section. These
+two formats are almost identical.
+
+DBM::Deep uses a tagged file layout. Every section has a tag, a size, and then
+the data.
+
+=head2 File header
+
+The file header consists of two parts. The first part is a fixed length of
+13 bytes:
+
+  DPDB h VVVV SSSS
+  \  / |  \    \
+   \/  '---.    \  '--- size of the second part of the header
+  file      \    '--- version
+  signature  tag
+
+=over 4
+
+=item * File Signature
+
+The first four bytes are 'DPDB' in network byte order, signifying that this is
+a DBM::Deep file.
+
+=item * File tag
+
+A literal ASCII 'h', indicating that this is the header. The file used by
+versions prior to 1.00 had a different fifth byte, allowing the difference
+to be determined.
+
+=item * Version
+
+This is four bytes containing the file version. This lets the file format change over time.
+
+It is packed in network order, so version 4 is stored as "\0\0\0\cD".
+
+=item * Header size
+
+The size of the second part of the header, in bytes. This number is also
+packed in network order.
+
+=back
+
+The second part of the header is as follows:
+
+  S B S T T(TTTTTTTTT...) (SS SS SS SS ...)  (continued...)
+  | | | |        \               |
+  | | | '----------.              \   staleness counters
+  | | '--------.    \              txn bitfield
+  | '------.    \    number of transactions
+  byte size \    data sector size
+             max buckets
+
+  (continuation...)
+  BB(BBBBBB) DD(DDDDDD) II(IIIIII)
+      |          |          |
+      |      free data      |
+  free blist            free index
+
+=over
+
+=item * Constants
+
+These are the file-wide constants that determine how the file is laid out.
+They can only be set upon file creation.
+
+The byte size is the number of bytes used to point to an offset elsewhere
+in the file. This corresponds to the C<pack_size> option. This and the
+next three values are stored as packed 8-bit integers (chars), so 2 is
+represented by "\cB".
+
+C<max_buckets> and C<data_sector_size> are documented in the main
+L<DBM::Deep> man page. The number stored is actually one less than what is
+passed to the constructor, to allow for a range of 1-256.
+
+The number of transactions corresponds to the C<num_txns> value passed to
+the constructor.
+
+=item * Transaction information
+
+The transaction bitfield consists of one bit for every available
+transaction ID. It is therefore anywhere from 1 byte to 32 bytes long.
+
+The staleness counters each take two bytes (packed 16-bit integers), one
+for each transaction, not including the so-called HEAD (the main
+transaction that all processes share I<without> calling C<begin_work>). So
+these take up 0 to 508 bytes.
+
+Staleness is explained in L<DBM::Deep::Engine/STALENESS>.
+
+=item * Freespace information
+
+Pointers into the first free sectors of the various sector sizes (Index,
+Bucketlist, and Data) are stored here. These are called chains internally,
+as each free sector points to the next one.
+
+The number of bytes is determined by the byte size, ranging from 2 to 8.
+
+=back
+
+=head2 Index
+
+The Index parts can be tagged either as Hash, Array, or Index. The latter
+is if there was a reindexing due to a bucketlist growing too large. The others
+are the root index for their respective datatypes. The index consists of a
+tag, a size, and then 256 sections containing file locations. Each section
+corresponds to each value representable in a byte.
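+
+The paragraph below describes how a lookup walks this tree. As a rough sketch
+only (C<loc_for> and C<is_index> are hypothetical helpers, not part of the
+DBM::Deep API), the walk looks like this:
+
+    # Walk the MD5 of the key one byte at a time, descending through
+    # index sectors until a bucketlist sector is reached.
+    my @bytes = unpack 'C16', $md5_of_key;  # 16 bytes => up to 16 levels
+    my $loc   = $root_index_loc;
+    for my $byte (@bytes) {
+        $loc = loc_for( $loc, $byte );      # follow one of the 256 slots
+        last unless is_index( $loc );       # stop once we hit a bucketlist
+    }
+    # $loc is now the bucketlist, which is searched by full MD5 comparison.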
+
+The index is used as follows - whenever a hashed key is being looked up, the
+first byte is used to determine which location to go to from the root index.
+Then, if that's also an index, the second byte is used, and so forth until a
+bucketlist is found.
+
+=head2 Bucketlist
+
+This is the part that contains the link to the data section. A bucketlist
+defaults to being 16 buckets long (modifiable by the I<max_buckets>
+parameter used when creating a new file). Each bucket contains an MD5 and a
+location of the appropriate key section.
+
+=head2 Key area
+
+This is the part that handles transactional awareness. There are
+I<num_txns> sections. Each section contains the location to the data
+section, a transaction ID, and whether that transaction considers this key to
+be deleted or not.
+
+=head2 Data area
+
+This is the part that actually stores the key, value, and class (if
+appropriate). The layout is:
+
+=over 4
+
+=item * tag
+
+=item * length of the value
+
+=item * the actual value
+
+=item * keylength
+
+=item * the actual key
+
+=item * a byte indicating if this value has a classname
+
+=item * the classname (if one is there)
+
+=back
+
+The key is stored after the value because the value is requested more often
+than the key.
+
+=head1 PERFORMANCE
+
+L<DBM::Deep> is written completely in Perl. It also is a multi-process DBM
+that uses the datafile as a method of synchronizing between multiple
+processes. This is unlike most RDBMSes like MySQL and Oracle. Furthermore,
+unlike all RDBMSes, L<DBM::Deep> stores both the data and the structure of
+that data as it would appear in a Perl program.
+
+=head2 CPU
+
+DBM::Deep attempts to be CPU-light. As it stores all the data on disk,
+DBM::Deep is I/O-bound, not CPU-bound.
+
+=head2 RAM
+
+DBM::Deep uses extremely little RAM relative to the amount of data you can
+access. You can iterate through a million keys (using C<each()>) without
+increasing your memory usage at all.
+
+=head2 DISK
+
+DBM::Deep is I/O-bound, pure and simple. The faster your disk, the faster
+DBM::Deep will be. Currently, when performing C<< $db->{foo} >>, there
+are a minimum of 4 seeks and 1332 + N bytes read (where N is the length of your
+data). (All values assume a medium filesize.) The actions taken are:
+
+=over 4
+
+=item 1 Lock the file
+
+=item 1 Perform a stat() to determine if the inode has changed
+
+=item 1 Go to the primary index for the $db (1 seek)
+
+=item 1 Read the tag/size of the primary index (5 bytes)
+
+=item 1 Read the body of the primary index (1024 bytes)
+
+=item 1 Go to the bucketlist for this MD5 (1 seek)
+
+=item 1 Read the tag/size of the bucketlist (5 bytes)
+
+=item 1 Read the body of the bucketlist (144 bytes)
+
+=item 1 Go to the keys location for this MD5 (1 seek)
+
+=item 1 Read the tag/size of the keys section (5 bytes)
+
+=item 1 Read the body of the keys location (144 bytes)
+
+=item 1 Go to the data section that corresponds to this transaction ID. (1 seek)
+
+=item 1 Read the tag/size of the data section (5 bytes)
+
+=item 1 Read the value for this data (N bytes)
+
+=item 1 Unlock the file
+
+=back
+
+Every additional level of indexing (if there are enough keys) requires an
+additional seek and the reading of 1029 additional bytes. If the value is
+blessed, an additional 1 seek and 9 + M bytes are read (where M is the length
+of the classname).
+
+Arrays are (currently) even worse because they're considered "funny hashes"
+with the length stored as just another key. This means that if you do any sort
+of lookup with a negative index, this entire process is performed twice - once
+for the length and once for the value.
+
+=head1 ACTUAL TESTS
+
+=head2 SPEED
+
+Obviously, DBM::Deep isn't going to be as fast as some C-based DBMs, such as
+the almighty I<BerkeleyDB>. But it makes up for it in features like true
+multi-level hash/array support, and cross-platform FTPable files. Even so,
+DBM::Deep is still pretty fast, and the speed stays fairly consistent, even
+with huge databases. Here is some test data:
+
+    Adding 1,000,000 keys to new DB file...
+
+    At 100 keys, avg. speed is 2,703 keys/sec
+    At 200 keys, avg. speed is 2,642 keys/sec
+    At 300 keys, avg. speed is 2,598 keys/sec
+    At 400 keys, avg. speed is 2,578 keys/sec
+    At 500 keys, avg. speed is 2,722 keys/sec
+    At 600 keys, avg. speed is 2,628 keys/sec
+    At 700 keys, avg. speed is 2,700 keys/sec
+    At 800 keys, avg. speed is 2,607 keys/sec
+    At 900 keys, avg. speed is 2,190 keys/sec
+    At 1,000 keys, avg. speed is 2,570 keys/sec
+    At 2,000 keys, avg. speed is 2,417 keys/sec
+    At 3,000 keys, avg. speed is 1,982 keys/sec
+    At 4,000 keys, avg. speed is 1,568 keys/sec
+    At 5,000 keys, avg. speed is 1,533 keys/sec
+    At 6,000 keys, avg. speed is 1,787 keys/sec
+    At 7,000 keys, avg. speed is 1,977 keys/sec
+    At 8,000 keys, avg. speed is 2,028 keys/sec
+    At 9,000 keys, avg. speed is 2,077 keys/sec
+    At 10,000 keys, avg. speed is 2,031 keys/sec
+    At 20,000 keys, avg. speed is 1,970 keys/sec
+    At 30,000 keys, avg. speed is 2,050 keys/sec
+    At 40,000 keys, avg. speed is 2,073 keys/sec
+    At 50,000 keys, avg. speed is 1,973 keys/sec
+    At 60,000 keys, avg. speed is 1,914 keys/sec
+    At 70,000 keys, avg. speed is 2,091 keys/sec
+    At 80,000 keys, avg. speed is 2,103 keys/sec
+    At 90,000 keys, avg. speed is 1,886 keys/sec
+    At 100,000 keys, avg. speed is 1,970 keys/sec
+    At 200,000 keys, avg. speed is 2,053 keys/sec
+    At 300,000 keys, avg. speed is 1,697 keys/sec
+    At 400,000 keys, avg. speed is 1,838 keys/sec
+    At 500,000 keys, avg. speed is 1,941 keys/sec
+    At 600,000 keys, avg. speed is 1,930 keys/sec
+    At 700,000 keys, avg. speed is 1,735 keys/sec
+    At 800,000 keys, avg. speed is 1,795 keys/sec
+    At 900,000 keys, avg. speed is 1,221 keys/sec
+    At 1,000,000 keys, avg. speed is 1,077 keys/sec
+
+This test was performed on a PowerMac G4 1GHz running Mac OS X 10.3.2 & Perl
+5.8.1, with an 80GB Ultra ATA/100 HD spinning at 7200RPM. The hash keys and
+values were between 6 - 12 chars in length. The DB file ended up at 210MB.
+Run time was 12 min 3 sec.
+
+=head2 MEMORY USAGE
+
+One of the great things about L<DBM::Deep> is that it uses very little memory.
+Even with huge databases (1,000,000+ keys) you will not see much increased
+memory on your process. L<DBM::Deep> relies solely on the filesystem for storing
+and fetching data. Here is output from I<top> before even opening a database
+handle:
+
+    PID USER PRI NI SIZE RSS SHARE STAT %CPU %MEM TIME COMMAND
+  22831 root 11 0 2716 2716 1296 R 0.0 0.2 0:07 perl
+
+Basically the process is taking 2,716K of memory. And here is the same
+process after storing and fetching 1,000,000 keys:
+
+    PID USER PRI NI SIZE RSS SHARE STAT %CPU %MEM TIME COMMAND
+  22831 root 14 0 2772 2772 1328 R 0.0 0.2 13:32 perl
+
+Notice the memory usage increased by only 56K. Test was performed on a 700MHz
+x86 box running Linux RedHat 7.2 & Perl 5.6.1.
+ +=cut diff --git a/src/modules/DBM/Deep/Iterator.pm b/src/modules/DBM/Deep/Iterator.pm new file mode 100644 index 0000000..f5a4eff --- /dev/null +++ b/src/modules/DBM/Deep/Iterator.pm @@ -0,0 +1,73 @@ +package DBM::Deep::Iterator; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +=head1 NAME + +DBM::Deep::Iterator - iterator for FIRSTKEY() and NEXTKEY() + +=head1 PURPOSE + +This is an internal-use-only object for L. It is the iterator +for FIRSTKEY() and NEXTKEY(). + +=head1 OVERVIEW + +This object + +=head1 METHODS + +=head2 new(\%params) + +The constructor takes a hashref of params. The hashref is assumed to have the +following elements: + +=over 4 + +=item * engine (of type L + +=item * base_offset (the base_offset of the invoking DBM::Deep object) + +=back + +=cut + +sub new { + my $class = shift; + my ($args) = @_; + + my $self = bless { + engine => $args->{engine}, + base_offset => $args->{base_offset}, + }, $class; + + Scalar::Util::weaken( $self->{engine} ); + + $self->reset; + + return $self; +} + +=head2 reset() + +This method takes no arguments. + +It will reset the iterator so that it will start from the beginning again. + +This method returns nothing. + +=cut + +sub reset { die "reset must be implemented in a child class" } + +=head2 get_next_key( $obj ) + +=cut + +sub get_next_key { die "get_next_key must be implemented in a child class" } + +1; +__END__ diff --git a/src/modules/DBM/Deep/Iterator/DBI.pm b/src/modules/DBM/Deep/Iterator/DBI.pm new file mode 100644 index 0000000..0aecbe8 --- /dev/null +++ b/src/modules/DBM/Deep/Iterator/DBI.pm @@ -0,0 +1,37 @@ +package DBM::Deep::Iterator::DBI; + +use strict; +use warnings FATAL => 'all'; + +use base qw( DBM::Deep::Iterator ); + +sub reset { + my $self = shift; + + eval { $self->{sth}->finish; }; + delete $self->{sth}; + + return; +} + +sub get_next_key { + my $self = shift; + my ($obj) = @_; + + unless ( exists $self->{sth} ) { + # For mysql, this needs to be RAND() + # For sqlite, this needs to be random() + my $storage = $self->{engine}->storage; + $self->{sth} = $storage->{dbh}->prepare( + "SELECT `key` FROM datas WHERE ref_id = ? ORDER BY " + . $storage->rand_function, + ); + $self->{sth}->execute( $self->{base_offset} ); + } + + my ($key) = $self->{sth}->fetchrow_array; + return $key; +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Iterator/File.pm b/src/modules/DBM/Deep/Iterator/File.pm new file mode 100644 index 0000000..b38ed94 --- /dev/null +++ b/src/modules/DBM/Deep/Iterator/File.pm @@ -0,0 +1,104 @@ +package DBM::Deep::Iterator::File; + +use strict; +use warnings FATAL => 'all'; + +use base qw( DBM::Deep::Iterator ); + +use DBM::Deep::Iterator::File::BucketList (); +use DBM::Deep::Iterator::File::Index (); + +sub reset { $_[0]{breadcrumbs} = []; return } + +sub get_sector_iterator { + my $self = shift; + my ($loc) = @_; + + my $sector = $self->{engine}->load_sector( $loc ) + or return; + + if ( $sector->isa( 'DBM::Deep::Sector::File::Index' ) ) { + return DBM::Deep::Iterator::File::Index->new({ + iterator => $self, + sector => $sector, + }); + } + elsif ( $sector->isa( 'DBM::Deep::Sector::File::BucketList' ) ) { + return DBM::Deep::Iterator::File::BucketList->new({ + iterator => $self, + sector => $sector, + }); + } + + DBM::Deep->_throw_error( "get_sector_iterator(): Why did $loc make a $sector?" 
); +} + +sub get_next_key { + my $self = shift; + my ($obj) = @_; + + my $crumbs = $self->{breadcrumbs}; + my $e = $self->{engine}; + + unless ( @$crumbs ) { + # This will be a Reference sector + my $sector = $e->load_sector( $self->{base_offset} ) + # If no sector is found, this must have been deleted from under us. + or return; + + if ( $sector->staleness != $obj->_staleness ) { + return; + } + + my $loc = $sector->get_blist_loc + or return; + + push @$crumbs, $self->get_sector_iterator( $loc ); + } + + FIND_NEXT_KEY: { + # We're at the end. + unless ( @$crumbs ) { + $self->reset; + return; + } + + my $iterator = $crumbs->[-1]; + + # This level is done. + if ( $iterator->at_end ) { + pop @$crumbs; + redo FIND_NEXT_KEY; + } + + if ( $iterator->isa( 'DBM::Deep::Iterator::File::Index' ) ) { + # If we don't have any more, it will be caught at the + # prior check. + if ( my $next = $iterator->get_next_iterator ) { + push @$crumbs, $next; + } + redo FIND_NEXT_KEY; + } + + unless ( $iterator->isa( 'DBM::Deep::Iterator::File::BucketList' ) ) { + DBM::Deep->_throw_error( + "Should have a bucketlist iterator here - instead have $iterator" + ); + } + + # At this point, we have a BucketList iterator + my $key = $iterator->get_next_key; + if ( defined $key ) { + return $key; + } + #XXX else { $iterator->set_to_end() } ? + + # We hit the end of the bucketlist iterator, so redo + redo FIND_NEXT_KEY; + } + + DBM::Deep->_throw_error( "get_next_key(): How did we get here?" ); +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Iterator/File/BucketList.pm b/src/modules/DBM/Deep/Iterator/File/BucketList.pm new file mode 100644 index 0000000..d62b9e9 --- /dev/null +++ b/src/modules/DBM/Deep/Iterator/File/BucketList.pm @@ -0,0 +1,90 @@ +package DBM::Deep::Iterator::File::BucketList; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +=head1 NAME + +DBM::Deep::Iterator::BucketList - mediate between DBM::Deep::Iterator and DBM::Deep::Engine::Sector::BucketList + +=head1 PURPOSE + +This is an internal-use-only object for L. It acts as the mediator +between the L object and a L +sector. + +=head1 OVERVIEW + +This object, despite the implied class hierarchy, does B inherit from +L. Instead, it delegates to it, essentially acting as a +facade over it. L will instantiate one of +these objects as needed to handle an BucketList sector. + +=head1 METHODS + +=head2 new(\%params) + +The constructor takes a hashref of params and blesses it into the invoking class. The +hashref is assumed to have the following elements: + +=over 4 + +=item * iterator (of type L + +=item * sector (of type L + +=back + +=cut + +sub new { + my $self = bless $_[1] => $_[0]; + $self->{curr_index} = 0; + return $self; +} + +=head2 at_end() + +This takes no arguments. + +This returns true/false indicating whether this sector has any more elements that can be +iterated over. + +=cut + +sub at_end { + my $self = shift; + return $self->{curr_index} >= $self->{iterator}{engine}->max_buckets; +} + +=head2 get_next_iterator() + +This takes no arguments. + +This returns the next key pointed to by this bucketlist. This value is suitable for +returning by FIRSTKEY or NEXTKEY(). + +If the bucketlist is exhausted, it returns nothing. + +=cut + +sub get_next_key { + my $self = shift; + + return if $self->at_end; + + my $idx = $self->{curr_index}++; + + my $data_loc = $self->{sector}->get_data_location_for({ + allow_head => 1, + idx => $idx, + }) or return; + + #XXX Do we want to add corruption checks here? 
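+    # (Note that the iterator hands back the *key* stored in this bucket;
+    # FIRSTKEY/NEXTKEY only enumerate keys, and values are fetched
+    # separately via FETCH when the caller asks for them.)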
+ return $self->{sector}->get_key_for( $idx )->data; +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Iterator/File/Index.pm b/src/modules/DBM/Deep/Iterator/File/Index.pm new file mode 100644 index 0000000..d2f6611 --- /dev/null +++ b/src/modules/DBM/Deep/Iterator/File/Index.pm @@ -0,0 +1,86 @@ +package DBM::Deep::Iterator::File::Index; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +=head1 NAME + +DBM::Deep::Iterator::Index - mediate between DBM::Deep::Iterator and DBM::Deep::Engine::Sector::Index + +=head1 PURPOSE + +This is an internal-use-only object for L. It acts as the mediator +between the L object and a L +sector. + +=head1 OVERVIEW + +This object, despite the implied class hierarchy, does B inherit from +L. Instead, it delegates to it, essentially acting as a +facade over it. L will instantiate one of +these objects as needed to handle an Index sector. + +=head1 METHODS + +=head2 new(\%params) + +The constructor takes a hashref of params and blesses it into the invoking class. The +hashref is assumed to have the following elements: + +=over 4 + +=item * iterator (of type L + +=item * sector (of type L + +=back + +=cut + +sub new { + my $self = bless $_[1] => $_[0]; + $self->{curr_index} = 0; + return $self; +} + +=head2 at_end() + +This takes no arguments. + +This returns true/false indicating whether this sector has any more elements that can be +iterated over. + +=cut + +sub at_end { + my $self = shift; + return $self->{curr_index} >= $self->{iterator}{engine}->hash_chars; +} + +=head2 get_next_iterator() + +This takes no arguments. + +This returns an iterator (built by L) based +on the sector pointed to by the next occupied location in this index. + +If the sector is exhausted, it returns nothing. + +=cut + +sub get_next_iterator { + my $self = shift; + + my $loc; + while ( !$loc ) { + return if $self->at_end; + $loc = $self->{sector}->get_entry( $self->{curr_index}++ ); + } + + return $self->{iterator}->get_sector_iterator( $loc ); +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Null.pm b/src/modules/DBM/Deep/Null.pm new file mode 100644 index 0000000..a521671 --- /dev/null +++ b/src/modules/DBM/Deep/Null.pm @@ -0,0 +1,49 @@ +package DBM::Deep::Null; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +=head1 NAME + +DBM::Deep::Null - NULL object + +=head1 PURPOSE + +This is an internal-use-only object for L. It acts as a NULL object +in the same vein as MARCEL's L. I couldn't use L +because DBM::Deep needed an object that always evaluated as undef, not an +implementation of the Null Class pattern. + +=head1 OVERVIEW + +It is used to represent null sectors in DBM::Deep. + +=cut + +use overload + 'bool' => sub { undef }, + '""' => sub { undef }, + '0+' => sub { 0 }, + ('cmp' => + '<=>' => sub { + return 0 if !defined $_[1] || !length $_[1]; + return $_[2] ? 
1 : -1; + } + )[0,2,1,2], # same sub for both ops + '%{}' => sub { + require Carp; + Carp::croak("Can't use a stale reference as a HASH"); + }, + '@{}' => sub { + require Carp; + Carp::croak("Can't use a stale reference as an ARRAY"); + }, + fallback => 1, + nomethod => 'AUTOLOAD'; + +sub AUTOLOAD { return; } + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector.pm b/src/modules/DBM/Deep/Sector.pm new file mode 100644 index 0000000..f394397 --- /dev/null +++ b/src/modules/DBM/Deep/Sector.pm @@ -0,0 +1,37 @@ +package DBM::Deep::Sector; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +use Scalar::Util (); + +sub new { + my $self = bless $_[1], $_[0]; + Scalar::Util::weaken( $self->{engine} ); + $self->_init; + return $self; +} + +sub _init {} + +sub clone { + my $self = shift; + return ref($self)->new({ + engine => $self->engine, + type => $self->type, + data => $self->data, + }); +} + + +sub engine { $_[0]{engine} } +sub offset { $_[0]{offset} } +sub type { $_[0]{type} } +sub staleness { $_[0]{staleness} } + +sub load { die "load must be implemented in a child class" } + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector/DBI.pm b/src/modules/DBM/Deep/Sector/DBI.pm new file mode 100644 index 0000000..ec6ecdf --- /dev/null +++ b/src/modules/DBM/Deep/Sector/DBI.pm @@ -0,0 +1,55 @@ +package DBM::Deep::Sector::DBI; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +use base qw( DBM::Deep::Sector ); + +use DBM::Deep::Sector::DBI::Reference (); +use DBM::Deep::Sector::DBI::Scalar (); + +sub free { + my $self = shift; + + $self->engine->storage->delete_from( + $self->table, $self->offset, + ); +} + +sub reload { + my $self = shift; + $self->_init; +} + +sub load { + my $self = shift; + my ($engine, $offset, $type) = @_; + + if ( !defined $type || $type eq 'refs' ) { + return DBM::Deep::Sector::DBI::Reference->new({ + engine => $engine, + offset => $offset, + }); + } + elsif ( $type eq 'datas' ) { + my $sector = DBM::Deep::Sector::DBI::Scalar->new({ + engine => $engine, + offset => $offset, + }); + + if ( $sector->{data_type} eq 'R' ) { + return $self->load( + $engine, $sector->{value}, 'refs', + ); + } + + return $sector; + } + + DBM::Deep->_throw_error( "'$offset': Don't know what to do with type '$type'" ); +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector/DBI/Reference.pm b/src/modules/DBM/Deep/Sector/DBI/Reference.pm new file mode 100644 index 0000000..ff828eb --- /dev/null +++ b/src/modules/DBM/Deep/Sector/DBI/Reference.pm @@ -0,0 +1,238 @@ +package DBM::Deep::Sector::DBI::Reference; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +use base 'DBM::Deep::Sector::DBI'; + +use Scalar::Util; + +sub table { 'refs' } + +sub _init { + my $self = shift; + + my $e = $self->engine; + + unless ( $self->offset ) { + my $classname = Scalar::Util::blessed( delete $self->{data} ); + $self->{offset} = $self->engine->storage->write_to( + refs => undef, + ref_type => $self->type, + classname => $classname, + ); + } + else { + my ($rows) = $self->engine->storage->read_from( + refs => $self->offset, + qw( ref_type ), + ); + + $self->{type} = $rows->[0]{ref_type}; + } + + return; +} + +sub get_data_for { + my $self = shift; + my ($args) = @_; + + my ($rows) = $self->engine->storage->read_from( + datas => { ref_id => $self->offset, key => $args->{key} }, + qw( id ), + ); + + return unless $rows->[0]{id}; + + $self->load( + $self->engine, + $rows->[0]{id}, + 'datas', + ); +} + +sub write_data { + my $self = shift; + my ($args) = @_; + + if ( ( 
$args->{value}->type || 'S' ) eq 'S' ) { + $args->{value}{offset} = $self->engine->storage->write_to( + datas => $args->{value}{offset}, + ref_id => $self->offset, + data_type => 'S', + key => $args->{key}, + value => $args->{value}{data}, + ); + + $args->{value}->reload; + } + else { + # Write the Scalar of the Reference + $self->engine->storage->write_to( + datas => undef, + ref_id => $self->offset, + data_type => 'R', + key => $args->{key}, + value => $args->{value}{offset}, + ); + } +} + +sub delete_key { + my $self = shift; + my ($args) = @_; + + my $old_value = $self->get_data_for({ + key => $args->{key}, + }); + + my $data; + if ( $old_value ) { + $data = $old_value->data({ export => 1 }); + + $self->engine->storage->delete_from( + 'datas', + { ref_id => $self->offset, + key => $args->{key}, }, + ); + $old_value->free; + } + + return $data; +} + +sub get_classname { + my $self = shift; + my ($rows) = $self->engine->storage->read_from( + 'refs', $self->offset, + qw( classname ), + ); + return unless @$rows; + return $rows->[0]{classname}; +} + +# Look to hoist this method into a ::Reference trait +sub data { + my $self = shift; + my ($args) = @_; + $args ||= {}; + + my $engine = $self->engine; + my $cache = $engine->cache; + my $off = $self->offset; + my $obj; + if ( !defined $cache->{ $off } ) { + $obj = DBM::Deep->new({ + type => $self->type, + base_offset => $self->offset, + storage => $engine->storage, + engine => $engine, + }); + + $cache->{$off} = $obj; + Scalar::Util::weaken($cache->{$off}); + } + else { + $obj = $cache->{$off}; + } + + # We're not exporting, so just return. + unless ( $args->{export} ) { + if ( $engine->storage->{autobless} ) { + my $classname = $self->get_classname; + if ( defined $classname ) { + bless $obj, $classname; + } + } + + return $obj; + } + + # We shouldn't export if this is still referred to. + if ( $self->get_refcount > 1 ) { + return $obj; + } + + return $obj->export; +} + +sub free { + my $self = shift; + + # We're not ready to be removed yet. + return if $self->decrement_refcount > 0; + + # Rebless the object into DBM::Deep::Null. + # In external_refs mode, this will already have been removed from + # the cache, so we can skip this. + my $e = $self->engine; + if(!$e->{external_refs}) { + eval { %{ $e->cache->{ $self->offset } } = (); }; + eval { @{ $e->cache->{ $self->offset } } = (); }; + bless $e->cache->{ $self->offset }, 'DBM::Deep::Null'; + delete $e->cache->{ $self->offset }; + } + + $e->storage->delete_from( + 'datas', { ref_id => $self->offset }, + ); + + $e->storage->delete_from( + 'datas', { value => $self->offset, data_type => 'R' }, + ); + + $self->SUPER::free( @_ ); +} + +sub increment_refcount { + my $self = shift; + my $refcount = $self->get_refcount; + $refcount++; + $self->write_refcount( $refcount ); + return $refcount; +} + +sub decrement_refcount { + my $self = shift; + my $refcount = $self->get_refcount; + $refcount--; + $self->write_refcount( $refcount ); + return $refcount; +} + +sub get_refcount { + my $self = shift; + my ($rows) = $self->engine->storage->read_from( + 'refs', $self->offset, + qw( refcount ), + ); + return $rows->[0]{refcount}; +} + +sub write_refcount { + my $self = shift; + my ($num) = @_; + $self->engine->storage->{dbh}->do( + "UPDATE refs SET refcount = ? 
WHERE id = ?", undef, + $num, $self->offset, + ); +} + +sub clear { + my $self = shift; + + DBM::Deep->new({ + type => $self->type, + base_offset => $self->offset, + storage => $self->engine->storage, + engine => $self->engine, + })->_clear; + + return; +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector/DBI/Scalar.pm b/src/modules/DBM/Deep/Sector/DBI/Scalar.pm new file mode 100644 index 0000000..276e66c --- /dev/null +++ b/src/modules/DBM/Deep/Sector/DBI/Scalar.pm @@ -0,0 +1,31 @@ +package DBM::Deep::Sector::DBI::Scalar; + +use strict; +use warnings FATAL => 'all'; + +use base qw( DBM::Deep::Sector::DBI ); + +sub table { 'datas' } + +sub _init { + my $self = shift; + + if ( $self->offset ) { + my ($rows) = $self->engine->storage->read_from( + datas => $self->offset, + qw( id data_type key value ), + ); + + $self->{$_} = $rows->[0]{$_} for qw( data_type key value ); + } + + return; +} + +sub data { + my $self = shift; + $self->{value}; +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector/File.pm b/src/modules/DBM/Deep/Sector/File.pm new file mode 100644 index 0000000..c93b9d8 --- /dev/null +++ b/src/modules/DBM/Deep/Sector/File.pm @@ -0,0 +1,104 @@ +package DBM::Deep::Sector::File; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +use base qw( DBM::Deep::Sector ); + +use DBM::Deep::Sector::File::BucketList (); +use DBM::Deep::Sector::File::Index (); +use DBM::Deep::Sector::File::Null (); +use DBM::Deep::Sector::File::Reference (); +use DBM::Deep::Sector::File::Scalar (); + +my $STALE_SIZE = 2; + +sub base_size { + my $self = shift; + return $self->engine->SIG_SIZE + $STALE_SIZE; +} + +sub free_meth { die "free_meth must be implemented in a child class" } + +sub free { + my $self = shift; + + my $e = $self->engine; + + $e->storage->print_at( $self->offset, $e->SIG_FREE ); + # Skip staleness counter + $e->storage->print_at( $self->offset + $self->base_size, + chr(0) x ($self->size - $self->base_size), + ); + + my $free_meth = $self->free_meth; + $e->$free_meth( $self->offset, $self->size ); + + return; +} + +#=head2 load( $offset ) +# +#This will instantiate and return the sector object that represents the data +#found at $offset. +# +#=cut + +sub load { + my $self = shift; + my ($engine, $offset) = @_; + + # Add a catch for offset of 0 or 1 + return if !$offset || $offset <= 1; + + my $type = $engine->storage->read_at( $offset, 1 ); + return if $type eq chr(0); + + if ( $type eq $engine->SIG_ARRAY || $type eq $engine->SIG_HASH ) { + return DBM::Deep::Sector::File::Reference->new({ + engine => $engine, + type => $type, + offset => $offset, + }); + } + # XXX Don't we need key_md5 here? + elsif ( $type eq $engine->SIG_BLIST ) { + return DBM::Deep::Sector::File::BucketList->new({ + engine => $engine, + type => $type, + offset => $offset, + }); + } + elsif ( $type eq $engine->SIG_INDEX ) { + return DBM::Deep::Sector::File::Index->new({ + engine => $engine, + type => $type, + offset => $offset, + }); + } + elsif ( $type eq $engine->SIG_NULL ) { + return DBM::Deep::Sector::File::Null->new({ + engine => $engine, + type => $type, + offset => $offset, + }); + } + elsif ( $type eq $engine->SIG_DATA || $type eq $engine->SIG_UNIDATA ) { + return DBM::Deep::Sector::File::Scalar->new({ + engine => $engine, + type => $type, + offset => $offset, + }); + } + # This was deleted from under us, so just return and let the caller figure it out. 
+ elsif ( $type eq $engine->SIG_FREE ) { + return; + } + + DBM::Deep->_throw_error( "'$offset': Don't know what to do with type '$type'" ); +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector/File/BucketList.pm b/src/modules/DBM/Deep/Sector/File/BucketList.pm new file mode 100644 index 0000000..3cb9aeb --- /dev/null +++ b/src/modules/DBM/Deep/Sector/File/BucketList.pm @@ -0,0 +1,376 @@ +package DBM::Deep::Sector::File::BucketList; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +use base qw( DBM::Deep::Sector::File ); + +my $STALE_SIZE = 2; + +# Please refer to the pack() documentation for further information +my %StP = ( + 1 => 'C', # Unsigned char value (no order needed as it's just one byte) + 2 => 'n', # Unsigned short in "network" (big-endian) order + 4 => 'N', # Unsigned long in "network" (big-endian) order + 8 => 'Q', # Usigned quad (no order specified, presumably machine-dependent) +); + +sub _init { + my $self = shift; + + my $engine = $self->engine; + + unless ( $self->offset ) { + my $leftover = $self->size - $self->base_size; + + $self->{offset} = $engine->_request_blist_sector( $self->size ); + $engine->storage->print_at( $self->offset, $engine->SIG_BLIST ); # Sector type + # Skip staleness counter + $engine->storage->print_at( $self->offset + $self->base_size, + chr(0) x $leftover, # Zero-fill the data + ); + } + + if ( $self->{key_md5} ) { + $self->find_md5; + } + + return $self; +} + +sub wipe { + my $self = shift; + $self->engine->storage->print_at( $self->offset + $self->base_size, + chr(0) x ($self->size - $self->base_size), # Zero-fill the data + ); +} + +sub size { + my $self = shift; + unless ( $self->{size} ) { + my $e = $self->engine; + # Base + numbuckets * bucketsize + $self->{size} = $self->base_size + $e->max_buckets * $self->bucket_size; + } + return $self->{size}; +} + +sub free_meth { '_add_free_blist_sector' } + +sub free { + my $self = shift; + + my $e = $self->engine; + foreach my $bucket ( $self->chopped_up ) { + my $rest = $bucket->[-1]; + + # Delete the keysector + my $l = unpack( $StP{$e->byte_size}, substr( $rest, $e->hash_size, $e->byte_size ) ); + my $s = $e->load_sector( $l ); $s->free if $s; + + # Delete the HEAD sector + $l = unpack( $StP{$e->byte_size}, + substr( $rest, + $e->hash_size + $e->byte_size, + $e->byte_size, + ), + ); + $s = $e->load_sector( $l ); $s->free if $s; + + foreach my $txn ( 0 .. $e->num_txns - 2 ) { + my $l = unpack( $StP{$e->byte_size}, + substr( $rest, + $e->hash_size + 2 * $e->byte_size + $txn * ($e->byte_size + $STALE_SIZE), + $e->byte_size, + ), + ); + my $s = $e->load_sector( $l ); $s->free if $s; + } + } + + $self->SUPER::free(); +} + +sub bucket_size { + my $self = shift; + unless ( $self->{bucket_size} ) { + my $e = $self->engine; + # Key + head (location) + transactions (location + staleness-counter) + my $location_size = $e->byte_size + $e->byte_size + ($e->num_txns - 1) * ($e->byte_size + $STALE_SIZE); + $self->{bucket_size} = $e->hash_size + $location_size; + } + return $self->{bucket_size}; +} + +# XXX This is such a poor hack. I need to rethink this code. +sub chopped_up { + my $self = shift; + + my $e = $self->engine; + + my @buckets; + foreach my $idx ( 0 .. $e->max_buckets - 1 ) { + my $spot = $self->offset + $self->base_size + $idx * $self->bucket_size; + my $md5 = $e->storage->read_at( $spot, $e->hash_size ); + + #XXX If we're chopping, why would we ever have the blank_md5? 
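+        # A blank md5 marks the first unused bucket; buckets are filled
+        # in order, so nothing past this point is occupied.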
+ last if $md5 eq $e->blank_md5; + + my $rest = $e->storage->read_at( undef, $self->bucket_size - $e->hash_size ); + push @buckets, [ $spot, $md5 . $rest ]; + } + + return @buckets; +} + +sub write_at_next_open { + my $self = shift; + my ($entry) = @_; + + #XXX This is such a hack! + $self->{_next_open} = 0 unless exists $self->{_next_open}; + + my $spot = $self->offset + $self->base_size + $self->{_next_open}++ * $self->bucket_size; + $self->engine->storage->print_at( $spot, $entry ); + + return $spot; +} + +sub has_md5 { + my $self = shift; + unless ( exists $self->{found} ) { + $self->find_md5; + } + return $self->{found}; +} + +sub find_md5 { + my $self = shift; + + $self->{found} = undef; + $self->{idx} = -1; + + if ( @_ ) { + $self->{key_md5} = shift; + } + + # If we don't have an MD5, then what are we supposed to do? + unless ( exists $self->{key_md5} ) { + DBM::Deep->_throw_error( "Cannot find_md5 without a key_md5 set" ); + } + + my $e = $self->engine; + foreach my $idx ( 0 .. $e->max_buckets - 1 ) { + my $potential = $e->storage->read_at( + $self->offset + $self->base_size + $idx * $self->bucket_size, $e->hash_size, + ); + + if ( $potential eq $e->blank_md5 ) { + $self->{idx} = $idx; + return; + } + + if ( $potential eq $self->{key_md5} ) { + $self->{found} = 1; + $self->{idx} = $idx; + return; + } + } + + return; +} + +sub write_md5 { + my $self = shift; + my ($args) = @_; + + DBM::Deep->_throw_error( "write_md5: no key" ) unless exists $args->{key}; + DBM::Deep->_throw_error( "write_md5: no key_md5" ) unless exists $args->{key_md5}; + DBM::Deep->_throw_error( "write_md5: no value" ) unless exists $args->{value}; + + my $engine = $self->engine; + + $args->{trans_id} = $engine->trans_id unless exists $args->{trans_id}; + + my $spot = $self->offset + $self->base_size + $self->{idx} * $self->bucket_size; + $engine->add_entry( $args->{trans_id}, $spot ); + + unless ($self->{found}) { + my $key_sector = DBM::Deep::Sector::File::Scalar->new({ + engine => $engine, + data => $args->{key}, + }); + + $engine->storage->print_at( $spot, + $args->{key_md5}, + pack( $StP{$engine->byte_size}, $key_sector->offset ), + ); + } + + my $loc = $spot + + $engine->hash_size + + $engine->byte_size; + + if ( $args->{trans_id} ) { + $loc += $engine->byte_size + ($args->{trans_id} - 1) * ( $engine->byte_size + $STALE_SIZE ); + + $engine->storage->print_at( $loc, + pack( $StP{$engine->byte_size}, $args->{value}->offset ), + pack( $StP{$STALE_SIZE}, $engine->get_txn_staleness_counter( $args->{trans_id} ) ), + ); + } + else { + $engine->storage->print_at( $loc, + pack( $StP{$engine->byte_size}, $args->{value}->offset ), + ); + } +} + +sub mark_deleted { + my $self = shift; + my ($args) = @_; + $args ||= {}; + + my $engine = $self->engine; + + $args->{trans_id} = $engine->trans_id unless exists $args->{trans_id}; + + my $spot = $self->offset + $self->base_size + $self->{idx} * $self->bucket_size; + $engine->add_entry( $args->{trans_id}, $spot ); + + my $loc = $spot + + $engine->hash_size + + $engine->byte_size; + + if ( $args->{trans_id} ) { + $loc += $engine->byte_size + ($args->{trans_id} - 1) * ( $engine->byte_size + $STALE_SIZE ); + + $engine->storage->print_at( $loc, + pack( $StP{$engine->byte_size}, 1 ), # 1 is the marker for deleted + pack( $StP{$STALE_SIZE}, $engine->get_txn_staleness_counter( $args->{trans_id} ) ), + ); + } + else { + $engine->storage->print_at( $loc, + pack( $StP{$engine->byte_size}, 1 ), # 1 is the marker for deleted + ); + } +} + +sub delete_md5 { + my $self = shift; + my ($args) 
= @_; + + my $engine = $self->engine; + return undef unless $self->{found}; + + # Save the location so that we can free the data + my $location = $self->get_data_location_for({ + allow_head => 0, + }); + my $key_sector = $self->get_key_for; + + my $spot = $self->offset + $self->base_size + $self->{idx} * $self->bucket_size; + $engine->storage->print_at( $spot, + $engine->storage->read_at( + $spot + $self->bucket_size, + $self->bucket_size * ( $engine->max_buckets - $self->{idx} - 1 ), + ), + chr(0) x $self->bucket_size, + ); + + $key_sector->free; + + my $data_sector = $self->engine->load_sector( $location ); + my $data = $data_sector->data({ export => 1 }); + $data_sector->free; + + return $data; +} + +sub get_data_location_for { + my $self = shift; + my ($args) = @_; + $args ||= {}; + + $args->{allow_head} = 0 unless exists $args->{allow_head}; + $args->{trans_id} = $self->engine->trans_id unless exists $args->{trans_id}; + $args->{idx} = $self->{idx} unless exists $args->{idx}; + + my $e = $self->engine; + + my $spot = $self->offset + $self->base_size + + $args->{idx} * $self->bucket_size + + $e->hash_size + + $e->byte_size; + + if ( $args->{trans_id} ) { + $spot += $e->byte_size + ($args->{trans_id} - 1) * ( $e->byte_size + $STALE_SIZE ); + } + + my $buffer = $e->storage->read_at( + $spot, + $e->byte_size + $STALE_SIZE, + ); + my ($loc, $staleness) = unpack( $StP{$e->byte_size} . ' ' . $StP{$STALE_SIZE}, $buffer ); + + # XXX Merge the two if-clauses below + if ( $args->{trans_id} ) { + # We have found an entry that is old, so get rid of it + if ( $staleness != (my $s = $e->get_txn_staleness_counter( $args->{trans_id} ) ) ) { + $e->storage->print_at( + $spot, + pack( $StP{$e->byte_size} . ' ' . $StP{$STALE_SIZE}, (0) x 2 ), + ); + $loc = 0; + } + } + + # If we're in a transaction and we never wrote to this location, try the + # HEAD instead. + if ( $args->{trans_id} && !$loc && $args->{allow_head} ) { + return $self->get_data_location_for({ + trans_id => 0, + allow_head => 1, + idx => $args->{idx}, + }); + } + + return $loc <= 1 ? 0 : $loc; +} + +sub get_data_for { + my $self = shift; + my ($args) = @_; + $args ||= {}; + + return unless $self->{found}; + my $location = $self->get_data_location_for({ + allow_head => $args->{allow_head}, + }); + return $self->engine->load_sector( $location ); +} + +sub get_key_for { + my $self = shift; + my ($idx) = @_; + $idx = $self->{idx} unless defined $idx; + + if ( $idx >= $self->engine->max_buckets ) { + DBM::Deep->_throw_error( "get_key_for(): Attempting to retrieve $idx" ); + } + + my $location = $self->engine->storage->read_at( + $self->offset + $self->base_size + $idx * $self->bucket_size + $self->engine->hash_size, + $self->engine->byte_size, + ); + $location = unpack( $StP{$self->engine->byte_size}, $location ); + DBM::Deep->_throw_error( "get_key_for: No location?" 
) unless $location; + + return $self->engine->load_sector( $location ); +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector/File/Data.pm b/src/modules/DBM/Deep/Sector/File/Data.pm new file mode 100644 index 0000000..9838a5f --- /dev/null +++ b/src/modules/DBM/Deep/Sector/File/Data.pm @@ -0,0 +1,15 @@ +package DBM::Deep::Sector::File::Data; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +use base qw( DBM::Deep::Sector::File ); + +# This is in bytes +sub size { $_[0]{engine}->data_sector_size } +sub free_meth { return '_add_free_data_sector' } + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector/File/Index.pm b/src/modules/DBM/Deep/Sector/File/Index.pm new file mode 100644 index 0000000..62b46f0 --- /dev/null +++ b/src/modules/DBM/Deep/Sector/File/Index.pm @@ -0,0 +1,98 @@ +package DBM::Deep::Sector::File::Index; + +use strict; +use warnings FATAL => 'all'; + +use base qw( DBM::Deep::Sector::File ); + +my $STALE_SIZE = 2; + +# Please refer to the pack() documentation for further information +my %StP = ( + 1 => 'C', # Unsigned char value (no order needed as it's just one byte) + 2 => 'n', # Unsigned short in "network" (big-endian) order + 4 => 'N', # Unsigned long in "network" (big-endian) order + 8 => 'Q', # Usigned quad (no order specified, presumably machine-dependent) +); + +sub _init { + my $self = shift; + + my $engine = $self->engine; + + unless ( $self->offset ) { + my $leftover = $self->size - $self->base_size; + + $self->{offset} = $engine->_request_index_sector( $self->size ); + $engine->storage->print_at( $self->offset, $engine->SIG_INDEX ); # Sector type + # Skip staleness counter + $engine->storage->print_at( $self->offset + $self->base_size, + chr(0) x $leftover, # Zero-fill the rest + ); + } + + return $self; +} + +#XXX Change here +sub size { + my $self = shift; + unless ( $self->{size} ) { + my $e = $self->engine; + $self->{size} = $self->base_size + $e->byte_size * $e->hash_chars; + } + return $self->{size}; +} + +sub free_meth { return '_add_free_index_sector' } + +sub free { + my $self = shift; + my $e = $self->engine; + + for my $i ( 0 .. 
$e->hash_chars - 1 ) { + my $l = $self->get_entry( $i ) or next; + $e->load_sector( $l )->free; + } + + $self->SUPER::free(); +} + +sub _loc_for { + my $self = shift; + my ($idx) = @_; + return $self->offset + $self->base_size + $idx * $self->engine->byte_size; +} + +sub get_entry { + my $self = shift; + my ($idx) = @_; + + my $e = $self->engine; + + DBM::Deep->_throw_error( "get_entry: Out of range ($idx)" ) + if $idx < 0 || $idx >= $e->hash_chars; + + return unpack( + $StP{$e->byte_size}, + $e->storage->read_at( $self->_loc_for( $idx ), $e->byte_size ), + ); +} + +sub set_entry { + my $self = shift; + my ($idx, $loc) = @_; + + my $e = $self->engine; + + DBM::Deep->_throw_error( "set_entry: Out of range ($idx)" ) + if $idx < 0 || $idx >= $e->hash_chars; + + $self->engine->storage->print_at( + $self->_loc_for( $idx ), + pack( $StP{$e->byte_size}, $loc ), + ); +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector/File/Null.pm b/src/modules/DBM/Deep/Sector/File/Null.pm new file mode 100644 index 0000000..ce77ea0 --- /dev/null +++ b/src/modules/DBM/Deep/Sector/File/Null.pm @@ -0,0 +1,46 @@ +package DBM::Deep::Sector::File::Null; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +use base qw( DBM::Deep::Sector::File::Data ); + +my $STALE_SIZE = 2; + +# Please refer to the pack() documentation for further information +my %StP = ( + 1 => 'C', # Unsigned char value (no order needed as it's just one byte) + 2 => 'n', # Unsigned short in "network" (big-endian) order + 4 => 'N', # Unsigned long in "network" (big-endian) order + 8 => 'Q', # Usigned quad (no order specified, presumably machine-dependent) +); + +sub type { $_[0]{engine}->SIG_NULL } +sub data_length { 0 } +sub data { return } + +sub _init { + my $self = shift; + + my $engine = $self->engine; + + unless ( $self->offset ) { + my $leftover = $self->size - $self->base_size - 1 * $engine->byte_size - 1; + + $self->{offset} = $engine->_request_data_sector( $self->size ); + $engine->storage->print_at( $self->offset, $self->type ); # Sector type + # Skip staleness counter + $engine->storage->print_at( $self->offset + $self->base_size, + pack( $StP{$engine->byte_size}, 0 ), # Chain loc + pack( $StP{1}, $self->data_length ), # Data length + chr(0) x $leftover, # Zero-fill the rest + ); + + return; + } +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector/File/Reference.pm b/src/modules/DBM/Deep/Sector/File/Reference.pm new file mode 100644 index 0000000..6f94ddd --- /dev/null +++ b/src/modules/DBM/Deep/Sector/File/Reference.pm @@ -0,0 +1,564 @@ +package DBM::Deep::Sector::File::Reference; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +use base qw( DBM::Deep::Sector::File::Data ); + +use Scalar::Util; + +my $STALE_SIZE = 2; + +# Please refer to the pack() documentation for further information +my %StP = ( + 1 => 'C', # Unsigned char value (no order needed as it's just one byte) + 2 => 'n', # Unsigned short in "network" (big-endian) order + 4 => 'N', # Unsigned long in "network" (big-endian) order + 8 => 'Q', # Usigned quad (no order specified, presumably machine-dependent) +); + +sub _init { + my $self = shift; + + my $e = $self->engine; + + unless ( $self->offset ) { + my $classname = Scalar::Util::blessed( delete $self->{data} ); + my $leftover = $self->size - $self->base_size - 3 * $e->byte_size; + + my $class_offset = 0; + if ( defined $classname ) { + my $class_sector = DBM::Deep::Sector::File::Scalar->new({ + engine => $e, + data => $classname, + }); + $class_offset = $class_sector->offset; + } + + 
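+        # Allocate the sector, then lay it out: type byte, staleness
+        # counter, the index/blist pointer, the classname pointer, an
+        # initial refcount of 1, and zero-padding for the remainder.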
$self->{offset} = $e->_request_data_sector( $self->size ); + $e->storage->print_at( $self->offset, $self->type ); # Sector type + # Skip staleness counter + $e->storage->print_at( $self->offset + $self->base_size, + pack( $StP{$e->byte_size}, 0 ), # Index/BList loc + pack( $StP{$e->byte_size}, $class_offset ), # Classname loc + pack( $StP{$e->byte_size}, 1 ), # Initial refcount + chr(0) x $leftover, # Zero-fill the rest + ); + } + else { + $self->{type} = $e->storage->read_at( $self->offset, 1 ); + } + + $self->{staleness} = unpack( + $StP{$STALE_SIZE}, + $e->storage->read_at( $self->offset + $e->SIG_SIZE, $STALE_SIZE ), + ); + + return; +} + +sub get_data_location_for { + my $self = shift; + my ($args) = @_; + + # Assume that the head is not allowed unless otherwise specified. + $args->{allow_head} = 0 unless exists $args->{allow_head}; + + # Assume we don't create a new blist location unless otherwise specified. + $args->{create} = 0 unless exists $args->{create}; + + my $blist = $self->get_bucket_list({ + key_md5 => $args->{key_md5}, + key => $args->{key}, + create => $args->{create}, + }); + return unless $blist && $blist->{found}; + + # At this point, $blist knows where the md5 is. What it -doesn't- know yet + # is whether or not this transaction has this key. That's part of the next + # function call. + my $location = $blist->get_data_location_for({ + allow_head => $args->{allow_head}, + }) or return; + + return $location; +} + +sub get_data_for { + my $self = shift; + my ($args) = @_; + + my $location = $self->get_data_location_for( $args ) + or return; + + return $self->engine->load_sector( $location ); +} + +sub write_data { + my $self = shift; + my ($args) = @_; + + my $blist = $self->get_bucket_list({ + key_md5 => $args->{key_md5}, + key => $args->{key}, + create => 1, + }) or DBM::Deep->_throw_error( "How did write_data fail (no blist)?!" ); + + # Handle any transactional bookkeeping. + if ( $self->engine->trans_id ) { + if ( ! $blist->has_md5 ) { + $blist->mark_deleted({ + trans_id => 0, + }); + } + } + else { + my @trans_ids = $self->engine->get_running_txn_ids; + if ( $blist->has_md5 ) { + if ( @trans_ids ) { + my $old_value = $blist->get_data_for; + foreach my $other_trans_id ( @trans_ids ) { + next if $blist->get_data_location_for({ + trans_id => $other_trans_id, + allow_head => 0, + }); + $blist->write_md5({ + trans_id => $other_trans_id, + key => $args->{key}, + key_md5 => $args->{key_md5}, + value => $old_value->clone, + }); + } + } + } + else { + if ( @trans_ids ) { + foreach my $other_trans_id ( @trans_ids ) { + #XXX This doesn't seem to possible to ever happen . . . + next if $blist->get_data_location_for({ trans_id => $other_trans_id, allow_head => 0 }); + $blist->mark_deleted({ + trans_id => $other_trans_id, + }); + } + } + } + } + + #XXX Is this safe to do transactionally? + # Free the place we're about to write to. + if ( $blist->get_data_location_for({ allow_head => 0 }) ) { + $blist->get_data_for({ allow_head => 0 })->free; + } + + $blist->write_md5({ + key => $args->{key}, + key_md5 => $args->{key_md5}, + value => $args->{value}, + }); +} + +sub delete_key { + my $self = shift; + my ($args) = @_; + + # This can return nothing if we are deleting an entry in a hashref that was + # auto-vivified as part of the delete process. 
For example: + # my $x = {}; + # delete $x->{foo}{bar}; + my $blist = $self->get_bucket_list({ + key_md5 => $args->{key_md5}, + }) or return; + + # Save the location so that we can free the data + my $location = $blist->get_data_location_for({ + allow_head => 0, + }); + my $old_value = $location && $self->engine->load_sector( $location ); + + my @trans_ids = $self->engine->get_running_txn_ids; + + # If we're the HEAD and there are running txns, then we need to clone this + # value to the other transactions to preserve Isolation. + if ( $self->engine->trans_id == 0 ) { + if ( @trans_ids ) { + foreach my $other_trans_id ( @trans_ids ) { + next if $blist->get_data_location_for({ trans_id => $other_trans_id, allow_head => 0 }); + $blist->write_md5({ + trans_id => $other_trans_id, + key => $args->{key}, + key_md5 => $args->{key_md5}, + value => $old_value->clone, + }); + } + } + } + + my $data; + if ( @trans_ids ) { + $blist->mark_deleted( $args ); + + if ( $old_value ) { + #XXX Is this export => 1 actually doing anything? + $data = $old_value->data({ export => 1 }); + $old_value->free; + } + } + else { + $data = $blist->delete_md5( $args ); + } + + return $data; +} + +sub write_blist_loc { + my $self = shift; + my ($loc) = @_; + + my $engine = $self->engine; + $engine->storage->print_at( $self->offset + $self->base_size, + pack( $StP{$engine->byte_size}, $loc ), + ); +} + +sub get_blist_loc { + my $self = shift; + + my $e = $self->engine; + my $blist_loc = $e->storage->read_at( $self->offset + $self->base_size, $e->byte_size ); + return unpack( $StP{$e->byte_size}, $blist_loc ); +} + +sub get_bucket_list { + my $self = shift; + my ($args) = @_; + $args ||= {}; + + # XXX Add in check here for recycling? + + my $engine = $self->engine; + + my $blist_loc = $self->get_blist_loc; + + # There's no index or blist yet + unless ( $blist_loc ) { + return unless $args->{create}; + + my $blist = DBM::Deep::Sector::File::BucketList->new({ + engine => $engine, + key_md5 => $args->{key_md5}, + }); + + $self->write_blist_loc( $blist->offset ); +# $engine->storage->print_at( $self->offset + $self->base_size, +# pack( $StP{$engine->byte_size}, $blist->offset ), +# ); + + return $blist; + } + + my $sector = $engine->load_sector( $blist_loc ) + or DBM::Deep->_throw_error( "Cannot read sector at $blist_loc in get_bucket_list()" ); + my $i = 0; + my $last_sector = undef; + while ( $sector->isa( 'DBM::Deep::Sector::File::Index' ) ) { + $blist_loc = $sector->get_entry( ord( substr( $args->{key_md5}, $i++, 1 ) ) ); + $last_sector = $sector; + if ( $blist_loc ) { + $sector = $engine->load_sector( $blist_loc ) + or DBM::Deep->_throw_error( "Cannot read sector at $blist_loc in get_bucket_list()" ); + } + else { + $sector = undef; + last; + } + } + + # This means we went through the Index sector(s) and found an empty slot + unless ( $sector ) { + return unless $args->{create}; + + DBM::Deep->_throw_error( "No last_sector when attempting to build a new entry" ) + unless $last_sector; + + my $blist = DBM::Deep::Sector::File::BucketList->new({ + engine => $engine, + key_md5 => $args->{key_md5}, + }); + + $last_sector->set_entry( ord( substr( $args->{key_md5}, $i - 1, 1 ) ) => $blist->offset ); + + return $blist; + } + + $sector->find_md5( $args->{key_md5} ); + + # See whether or not we need to reindex the bucketlist + # Yes, the double-braces are there for a reason. if() doesn't create a + # redo-able block, so we have to create a bare block within the if() for + # redo-purposes. 
+ # Patch and idea submitted by sprout@cpan.org. -RobK, 2008-01-09 + if ( !$sector->has_md5 && $args->{create} && $sector->{idx} == -1 ) {{ + my $redo; + + my $new_index = DBM::Deep::Sector::File::Index->new({ + engine => $engine, + }); + + my %blist_cache; + #XXX q.v. the comments for this function. + foreach my $entry ( $sector->chopped_up ) { + my ($spot, $md5) = @{$entry}; + my $idx = ord( substr( $md5, $i, 1 ) ); + + # XXX This is inefficient + my $blist = $blist_cache{$idx} + ||= DBM::Deep::Sector::File::BucketList->new({ + engine => $engine, + }); + + $new_index->set_entry( $idx => $blist->offset ); + + my $new_spot = $blist->write_at_next_open( $md5 ); + $engine->reindex_entry( $spot => $new_spot ); + } + + # Handle the new item separately. + { + my $idx = ord( substr( $args->{key_md5}, $i, 1 ) ); + + # If all the previous blist's items have been thrown into one + # blist and the new item belongs in there too, we need + # another index. + if ( keys %blist_cache == 1 and each %blist_cache == $idx ) { + ++$i, ++$redo; + } else { + my $blist = $blist_cache{$idx} + ||= DBM::Deep::Sector::File::BucketList->new({ + engine => $engine, + }); + + $new_index->set_entry( $idx => $blist->offset ); + + #XXX THIS IS HACKY! + $blist->find_md5( $args->{key_md5} ); + $blist->write_md5({ + key => $args->{key}, + key_md5 => $args->{key_md5}, + value => DBM::Deep::Sector::File::Null->new({ + engine => $engine, + data => undef, + }), + }); + } + } + + if ( $last_sector ) { + $last_sector->set_entry( + ord( substr( $args->{key_md5}, $i - 1, 1 ) ), + $new_index->offset, + ); + } else { + $engine->storage->print_at( $self->offset + $self->base_size, + pack( $StP{$engine->byte_size}, $new_index->offset ), + ); + } + + $sector->wipe; + $sector->free; + + if ( $redo ) { + (undef, $sector) = %blist_cache; + $last_sector = $new_index; + redo; + } + + $sector = $blist_cache{ ord( substr( $args->{key_md5}, $i, 1 ) ) }; + $sector->find_md5( $args->{key_md5} ); + }} + + return $sector; +} + +sub get_class_offset { + my $self = shift; + + my $e = $self->engine; + return unpack( + $StP{$e->byte_size}, + $e->storage->read_at( + $self->offset + $self->base_size + 1 * $e->byte_size, $e->byte_size, + ), + ); +} + +sub get_classname { + my $self = shift; + + my $class_offset = $self->get_class_offset; + + return unless $class_offset; + + return $self->engine->load_sector( $class_offset )->data; +} + +# Look to hoist this method into a ::Reference trait +sub data { + my $self = shift; + my ($args) = @_; + $args ||= {}; + + my $engine = $self->engine; + my $cache_entry = $engine->cache->{ $self->offset } ||= {}; + my $trans_id = $engine->trans_id; + my $obj; + if ( !defined $$cache_entry{ $trans_id } ) { + $obj = DBM::Deep->new({ + type => $self->type, + base_offset => $self->offset, + staleness => $self->staleness, + storage => $engine->storage, + engine => $engine, + }); + + $$cache_entry{ $trans_id } = $obj; + Scalar::Util::weaken($$cache_entry{ $trans_id }); + } + else { + $obj = $$cache_entry{ $trans_id }; + } + + # We're not exporting, so just return. + unless ( $args->{export} ) { + if ( $engine->storage->{autobless} ) { + my $classname = $self->get_classname; + if ( defined $classname ) { + bless $obj, $classname; + } + } + + return $obj; + } + + # We shouldn't export if this is still referred to. + if ( $self->get_refcount > 1 ) { + return $obj; + } + + return $obj->export; +} + +sub free { + my $self = shift; + + # We're not ready to be removed yet. 
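+    # Something else still holds a reference to this sector, so only the
+    # refcount changes.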
+ return if $self->decrement_refcount > 0; + + my $e = $self->engine; + + # Rebless the object into DBM::Deep::Null. + # In external_refs mode, this will already have been removed from + # the cache, so we can skip this. + if(!$e->{external_refs}) { +# eval { %{ $e->cache->{ $self->offset }{ $e->trans_id } } = (); }; +# eval { @{ $e->cache->{ $self->offset }{ $e->trans_id } } = (); }; + my $cache = $e->cache; + my $off = $self->offset; + if( exists $cache->{ $off } + and exists $cache->{ $off }{ my $trans_id = $e->trans_id } ) { + bless $cache->{ $off }{ $trans_id }, 'DBM::Deep::Null' + if defined $cache->{ $off }{ $trans_id }; + delete $cache->{ $off }{ $trans_id }; + } + } + + my $blist_loc = $self->get_blist_loc; + $e->load_sector( $blist_loc )->free if $blist_loc; + + my $class_loc = $self->get_class_offset; + $e->load_sector( $class_loc )->free if $class_loc; + + $self->SUPER::free(); +} + +sub increment_refcount { + my $self = shift; + + my $refcount = $self->get_refcount; + + $refcount++; + + $self->write_refcount( $refcount ); + + return $refcount; +} + +sub decrement_refcount { + my $self = shift; + + my $refcount = $self->get_refcount; + + $refcount--; + + $self->write_refcount( $refcount ); + + return $refcount; +} + +sub get_refcount { + my $self = shift; + + my $e = $self->engine; + return unpack( + $StP{$e->byte_size}, + $e->storage->read_at( + $self->offset + $self->base_size + 2 * $e->byte_size, $e->byte_size, + ), + ); +} + +sub write_refcount { + my $self = shift; + my ($num) = @_; + + my $e = $self->engine; + $e->storage->print_at( + $self->offset + $self->base_size + 2 * $e->byte_size, + pack( $StP{$e->byte_size}, $num ), + ); +} + +sub clear { + my $self = shift; + + my $blist_loc = $self->get_blist_loc or return; + + my $engine = $self->engine; + + # This won't work with autoblessed items. + if ($engine->get_running_txn_ids) { + # ~~~ Temporary; the code below this block needs to be modified to + # take transactions into account. 
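+        # For now, clear through the tied DBM::Deep object itself, whose
+        # delete path takes the running transactions into account.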
+ $self->data->_get_self->_clear; + return; + } + + my $sector = $engine->load_sector( $blist_loc ) + or DBM::Deep->_throw_error( + "Cannot read sector at $blist_loc in clear()" + ); + + # Set blist offset to 0 + $engine->storage->print_at( $self->offset + $self->base_size, + pack( $StP{$engine->byte_size}, 0 ), + ); + + # Free the blist + $sector->free; + + return; +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Sector/File/Scalar.pm b/src/modules/DBM/Deep/Sector/File/Scalar.pm new file mode 100644 index 0000000..cd4d793 --- /dev/null +++ b/src/modules/DBM/Deep/Sector/File/Scalar.pm @@ -0,0 +1,143 @@ +package DBM::Deep::Sector::File::Scalar; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; +no warnings 'recursion'; + +use base qw( DBM::Deep::Sector::File::Data ); + +my $STALE_SIZE = 2; + +# Please refer to the pack() documentation for further information +my %StP = ( + 1 => 'C', # Unsigned char value (no order needed as it's just one byte) + 2 => 'n', # Unsigned short in "network" (big-endian) order + 4 => 'N', # Unsigned long in "network" (big-endian) order + 8 => 'Q', # Usigned quad (no order specified, presumably machine-dependent) +); + +sub free { + my $self = shift; + + my $chain_loc = $self->chain_loc; + + $self->SUPER::free(); + + if ( $chain_loc ) { + $self->engine->load_sector( $chain_loc )->free; + } + + return; +} + +sub _init { + my $self = shift; + + my $engine = $self->engine; + + unless ( $self->offset ) { + my $data_section = $self->size - $self->base_size - $engine->byte_size - 1; + + $self->{offset} = $engine->_request_data_sector( $self->size ); + + my $data = delete $self->{data}; + my $utf8 = do { no warnings 'utf8'; $data !~ /^[\0-\xff]*\z/ }; + if($utf8){ + if($engine->{v} < 4) { + DBM::Deep->_throw_error( + "This database format version is too old for Unicode" + ); + } + utf8::encode $data; + $self->{type} = $engine->SIG_UNIDATA; + } + else { $self->{type} = $engine->SIG_DATA; } + + my $dlen = length $data; + my $continue = 1; + my $curr_offset = $self->offset; + while ( $continue ) { + + my $next_offset = 0; + + my ($leftover, $this_len, $chunk); + if ( $dlen > $data_section ) { + $leftover = 0; + $this_len = $data_section; + $chunk = substr( $data, 0, $this_len ); + + $dlen -= $data_section; + $next_offset = $engine->_request_data_sector( $self->size ); + $data = substr( $data, $this_len ); + } + else { + $leftover = $data_section - $dlen; + $this_len = $dlen; + $chunk = $data; + + $continue = 0; + } + + $engine->storage->print_at( $curr_offset, $self->type ); # Sector type + # Skip staleness + $engine->storage->print_at( $curr_offset + $self->base_size, + pack( $StP{$engine->byte_size}, $next_offset ), # Chain loc + pack( $StP{1}, $this_len ), # Data length + $chunk, # Data to be stored in this sector + chr(0) x $leftover, # Zero-fill the rest + ); + + $curr_offset = $next_offset; + } + + return; + } +} + +sub data_length { + my $self = shift; + + my $buffer = $self->engine->storage->read_at( + $self->offset + $self->base_size + $self->engine->byte_size, 1 + ); + + return unpack( $StP{1}, $buffer ); +} + +sub chain_loc { + my $self = shift; + return unpack( + $StP{$self->engine->byte_size}, + $self->engine->storage->read_at( + $self->offset + $self->base_size, + $self->engine->byte_size, + ), + ); +} + +sub data { + my $self = shift; + my $engine = $self->engine; + + my $data; + while ( 1 ) { + my $chain_loc = $self->chain_loc; + + $data .= $engine->storage->read_at( + $self->offset + $self->base_size + $engine->byte_size + 1, 
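+            # (the payload starts after the chain pointer and the
+            # one-byte length field)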
$self->data_length, + ); + + last unless $chain_loc; + + $self = $engine->load_sector( $chain_loc ); + } + + utf8::decode $data if $self->type eq $engine->SIG_UNIDATA; + + return $data; +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Storage.pm b/src/modules/DBM/Deep/Storage.pm new file mode 100644 index 0000000..ec6b0ac --- /dev/null +++ b/src/modules/DBM/Deep/Storage.pm @@ -0,0 +1,70 @@ +package DBM::Deep::Storage; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +=head1 NAME + +DBM::Deep::Storage - abstract base class for storage + +=head2 flush() + +This flushes the filehandle. This takes no parameters and returns nothing. + +=cut + +sub flush { die "flush must be implemented in a child class" } + +=head2 is_writable() + +This takes no parameters. It returns a boolean saying if this filehandle is +writable. + +Taken from L. + +=cut + +sub is_writable { die "is_writable must be implemented in a child class" } + +=head1 LOCKING + +This is where the actual locking of the storage medium is performed. +Nested locking is supported. + +B: It is unclear what will happen if a read lock is taken, then +a write lock is taken as a nested lock, then the write lock is released. + +Currently, the only locking method supported is flock(1). This is a +whole-file lock. In the future, more granular locking may be supported. +The API for that is unclear right now. + +The following methods manage the locking status. In all cases, they take +a L object and returns nothing. + +=over 4 + +=item * lock_exclusive( $obj ) + +Take a lock usable for writing. + +=item * lock_shared( $obj ) + +Take a lock usable for reading. + +=item * unlock( $obj ) + +Releases the last lock taken. If this is the outermost lock, then the +object is actually unlocked. + +=back + +=cut + +sub lock_exclusive { die "lock_exclusive must be implemented in a child class" } +sub lock_shared { die "lock_shared must be implemented in a child class" } +sub unlock { die "unlock must be implemented in a child class" } + +1; +__END__ diff --git a/src/modules/DBM/Deep/Storage/DBI.pm b/src/modules/DBM/Deep/Storage/DBI.pm new file mode 100644 index 0000000..26e55e7 --- /dev/null +++ b/src/modules/DBM/Deep/Storage/DBI.pm @@ -0,0 +1,170 @@ +package DBM::Deep::Storage::DBI; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +use base 'DBM::Deep::Storage'; + +use DBI; + +sub new { + my $class = shift; + my ($args) = @_; + + my $self = bless { + autobless => 1, + dbh => undef, + dbi => undef, + }, $class; + + # Grab the parameters we want to use + foreach my $param ( keys %$self ) { + next unless exists $args->{$param}; + $self->{$param} = $args->{$param}; + } + + if ( $self->{dbh} ) { + $self->{driver} = lc $self->{dbh}->{Driver}->{Name}; + } + else { + $self->open; + } + + # Foreign keys are turned off by default in SQLite3 (for now) + #q.v. http://search.cpan.org/~adamk/DBD-SQLite-1.27/lib/DBD/SQLite.pm#Foreign_Keys + # for more info. + if ( $self->driver eq 'sqlite' ) { + $self->{dbh}->do( 'PRAGMA foreign_keys = ON' ); + } + + return $self; +} + +sub open { + my $self = shift; + + return if $self->{dbh}; + + $self->{dbh} = DBI->connect( + $self->{dbi}{dsn}, $self->{dbi}{username}, $self->{dbi}{password}, { + AutoCommit => 1, + PrintError => 0, + RaiseError => 1, + %{ $self->{dbi}{connect_args} || {} }, + }, + ) or die $DBI::error; + + # Should we use the same method as done in new() if passed a $dbh? + (undef, $self->{driver}) = map defined($_) ? 
lc($_) : undef, DBI->parse_dsn( $self->{dbi}{dsn} ); + + return 1; +} + +sub close { + my $self = shift; + $self->{dbh}->disconnect if $self->{dbh}; + return 1; +} + +sub DESTROY { + my $self = shift; + $self->close if ref $self; +} + +# Is there a portable way of determining writability to a DBH? +sub is_writable { + my $self = shift; + return 1; +} + +sub lock_exclusive { + my $self = shift; +} + +sub lock_shared { + my $self = shift; +} + +sub unlock { + my $self = shift; +# $self->{dbh}->commit; +} + +#sub begin_work { +# my $self = shift; +# $self->{dbh}->begin_work; +#} +# +#sub commit { +# my $self = shift; +# $self->{dbh}->commit; +#} +# +#sub rollback { +# my $self = shift; +# $self->{dbh}->rollback; +#} + +sub read_from { + my $self = shift; + my ($table, $cond, @cols) = @_; + + $cond = { id => $cond } unless ref $cond; + + my @keys = keys %$cond; + my $where = join ' AND ', map { "`$_` = ?" } @keys; + + return $self->{dbh}->selectall_arrayref( + "SELECT `@{[join '`,`', @cols ]}` FROM $table WHERE $where", + { Slice => {} }, @{$cond}{@keys}, + ); +} + +sub flush {} + +sub write_to { + my $self = shift; + my ($table, $id, %args) = @_; + + my @keys = keys %args; + my $sql = + "REPLACE INTO $table ( `id`, " + . join( ',', map { "`$_`" } @keys ) + . ") VALUES (" + . join( ',', ('?') x (@keys + 1) ) + . ")"; + $self->{dbh}->do( $sql, undef, $id, @args{@keys} ); + + return $self->{dbh}->last_insert_id("", "", "", ""); +} + +sub delete_from { + my $self = shift; + my ($table, $cond) = @_; + + $cond = { id => $cond } unless ref $cond; + + my @keys = keys %$cond; + my $where = join ' AND ', map { "`$_` = ?" } @keys; + + $self->{dbh}->do( + "DELETE FROM $table WHERE $where", undef, @{$cond}{@keys}, + ); +} + +sub driver { $_[0]{driver} } + +sub rand_function { + my $self = shift; + my $driver = $self->driver; + + $driver eq 'sqlite' and return 'random()'; + $driver eq 'mysql' and return 'RAND()'; + + die "rand_function undefined for $driver\n"; +} + +1; +__END__ diff --git a/src/modules/DBM/Deep/Storage/File.pm b/src/modules/DBM/Deep/Storage/File.pm new file mode 100644 index 0000000..38eaef3 --- /dev/null +++ b/src/modules/DBM/Deep/Storage/File.pm @@ -0,0 +1,399 @@ +package DBM::Deep::Storage::File; + +use 5.008_004; + +use strict; +use warnings FATAL => 'all'; + +use Fcntl qw( :DEFAULT :flock :seek ); + +use constant DEBUG => 0; + +use base 'DBM::Deep::Storage'; + +=head1 NAME + +DBM::Deep::Storage::File - mediate low-level interaction with storage mechanism + +=head1 PURPOSE + +This is an internal-use-only object for L. It mediates the low-level +interaction with the storage mechanism. + +Currently, the only storage mechanism supported is the file system. + +=head1 OVERVIEW + +This class provides an abstraction to the storage mechanism so that the Engine +(the only class that uses this class) doesn't have to worry about that. + +=head1 METHODS + +=head2 new( \%args ) + +=cut + +sub new { + my $class = shift; + my ($args) = @_; + + my $self = bless { + autobless => 1, + autoflush => 1, + end => 0, + fh => undef, + file => undef, + file_offset => 0, + locking => 1, + locked => 0, +#XXX Migrate this to the engine, where it really belongs. 
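+        # Optional user-supplied filters, applied to keys and values on
+        # their way to and from disk.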
+ filter_store_key => undef, + filter_store_value => undef, + filter_fetch_key => undef, + filter_fetch_value => undef, + }, $class; + + # Grab the parameters we want to use + foreach my $param ( keys %$self ) { + next unless exists $args->{$param}; + $self->{$param} = $args->{$param}; + } + + if ( $self->{fh} && !$self->{file_offset} ) { + $self->{file_offset} = tell( $self->{fh} ); + } + + $self->open unless $self->{fh}; + + return $self; +} + +=head2 open() + +This method opens the filehandle for the filename in C< file >. + +There is no return value. + +=cut + +# TODO: What happens if we ->open when we already have a $fh? +sub open { + my $self = shift; + + # Adding O_BINARY should remove the need for the binmode below. However, + # I'm not going to remove it because I don't have the Win32 chops to be + # absolutely certain everything will be ok. + my $flags = O_CREAT | O_BINARY; + + if ( !-e $self->{file} || -w _ ) { + $flags |= O_RDWR; + } + else { + $flags |= O_RDONLY; + } + + my $fh; + sysopen( $fh, $self->{file}, $flags ) + or die "DBM::Deep: Cannot sysopen file '$self->{file}': $!\n"; + $self->{fh} = $fh; + + # Even though we use O_BINARY, better be safe than sorry. + binmode $fh; + + if ($self->{autoflush}) { + my $old = select $fh; + $|=1; + select $old; + } + + return 1; +} + +=head2 close() + +If the filehandle is opened, this will close it. + +There is no return value. + +=cut + +sub close { + my $self = shift; + + if ( $self->{fh} ) { + close $self->{fh}; + $self->{fh} = undef; + } + + return 1; +} + +=head2 size() + +This will return the size of the DB. If file_offset is set, this will take that into account. + +B: This function isn't used internally anywhere. + +=cut + +sub size { + my $self = shift; + + return 0 unless $self->{fh}; + return( (-s $self->{fh}) - $self->{file_offset} ); +} + +=head2 set_inode() + +This will set the inode value of the underlying file object. + +This is only needed to handle some obscure Win32 bugs. It really shouldn't be +needed outside this object. + +There is no return value. + +=cut + +sub set_inode { + my $self = shift; + + unless ( defined $self->{inode} ) { + my @stats = stat($self->{fh}); + $self->{inode} = $stats[1]; + $self->{end} = $stats[7]; + } + + return 1; +} + +=head2 print_at( $offset, @data ) + +This takes an optional offset and some data to print. + +C< $offset >, if defined, will be used to seek into the file. If file_offset is +set, it will be used as the zero location. If it is undefined, no seeking will +occur. Then, C< @data > will be printed to the current location. + +There is no return value. + +=cut + +sub print_at { + my $self = shift; + my $loc = shift; + + local ($,,$\); + + my $fh = $self->{fh}; + if ( defined $loc ) { + seek( $fh, $loc + $self->{file_offset}, SEEK_SET ); + } + + if ( DEBUG ) { + my $caller = join ':', (caller)[0,2]; + my $len = length( join '', @_ ); + warn "($caller) print_at( " . (defined $loc ? $loc : '') . ", $len )\n"; + } + + print( $fh @_ ) or die "Internal Error (print_at($loc)): $!\n"; + + return 1; +} + +=head2 read_at( $offset, $length ) + +This takes an optional offset and a length. + +C< $offset >, if defined, will be used to seek into the file. If file_offset is +set, it will be used as the zero location. If it is undefined, no seeking will +occur. Then, C< $length > bytes will be read from the current location. + +The data read will be returned. 
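+For example, reading four bytes at a known offset (the offset and length
+here are illustrative):
+
+    my $buffer = $storage->read_at( $loc, 4 );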
+ +=cut + +sub read_at { + my $self = shift; + my ($loc, $size) = @_; + + my $fh = $self->{fh}; + if ( defined $loc ) { + seek( $fh, $loc + $self->{file_offset}, SEEK_SET ); + } + + if ( DEBUG ) { + my $caller = join ':', (caller)[0,2]; + warn "($caller) read_at( " . (defined $loc ? $loc : '') . ", $size )\n"; + } + + my $buffer; + read( $fh, $buffer, $size); + + return $buffer; +} + +=head2 DESTROY + +When the ::Storage::File object goes out of scope, it will be closed. + +=cut + +sub DESTROY { + my $self = shift; + return unless $self; + + $self->close; + + return; +} + +=head2 request_space( $size ) + +This takes a size and adds that much space to the DBM. + +This returns the offset for the new location. + +=cut + +sub request_space { + my $self = shift; + my ($size) = @_; + + #XXX Do I need to reset $self->{end} here? I need a testcase + my $loc = $self->{end}; + $self->{end} += $size; + + return $loc; +} + +=head2 copy_stats( $target_filename ) + +This will take the stats for the current filehandle and apply them to +C< $target_filename >. The stats copied are: + +=over 4 + +=item * Onwer UID and GID + +=item * Permissions + +=back + +=cut + +sub copy_stats { + my $self = shift; + my ($temp_filename) = @_; + + my @stats = stat( $self->{fh} ); + my $perms = $stats[2] & 07777; + my $uid = $stats[4]; + my $gid = $stats[5]; + chown( $uid, $gid, $temp_filename ); + chmod( $perms, $temp_filename ); +} + +sub flush { + my $self = shift; + + # Flush the filehandle + my $old_fh = select $self->{fh}; + my $old_af = $|; $| = 1; $| = $old_af; + select $old_fh; + + return 1; +} + +sub is_writable { + my $self = shift; + + my $fh = $self->{fh}; + return unless defined $fh; + return unless defined fileno $fh; + local $\ = ''; # just in case + no warnings; # temporarily disable warnings + local $^W; # temporarily disable warnings + return print $fh ''; +} + +sub lock_exclusive { + my $self = shift; + my ($obj) = @_; + return $self->_lock( $obj, LOCK_EX ); +} + +sub lock_shared { + my $self = shift; + my ($obj) = @_; + return $self->_lock( $obj, LOCK_SH ); +} + +sub _lock { + my $self = shift; + my ($obj, $type) = @_; + + $type = LOCK_EX unless defined $type; + + #XXX This is a temporary fix for Win32 and autovivification. It + # needs to improve somehow. -RobK, 2008-03-09 + if ( $^O eq 'MSWin32' || $^O eq 'cygwin' ) { + $type = LOCK_EX; + } + + if (!defined($self->{fh})) { return; } + + #XXX This either needs to allow for upgrading a shared lock to an + # exclusive lock or something else with autovivification. + # -RobK, 2008-03-09 + if ($self->{locking}) { + if (!$self->{locked}) { + flock($self->{fh}, $type); + + # refresh end counter in case file has changed size + my @stats = stat($self->{fh}); + $self->{end} = $stats[7]; + + # double-check file inode, in case another process + # has optimize()d our file while we were waiting. 
+            if (defined($self->{inode}) && $stats[1] != $self->{inode}) {
+                $self->close;
+                $self->open;
+
+                #XXX This needs work
+                $obj->{engine}->setup( $obj );
+
+                flock($self->{fh}, $type); # re-lock
+
+                # This may not be necessary after re-opening
+                $self->{end} = (stat($self->{fh}))[7]; # re-end
+            }
+        }
+        $self->{locked}++;
+
+        return 1;
+    }
+
+    return;
+}
+
+sub unlock {
+    my $self = shift;
+
+    if (!defined($self->{fh})) { return; }
+
+    if ($self->{locking} && $self->{locked} > 0) {
+        $self->{locked}--;
+
+        if (!$self->{locked}) {
+            flock($self->{fh}, LOCK_UN);
+            return 1;
+        }
+
+        return;
+    }
+
+    return;
+}
+
+1;
+__END__
diff --git a/src/modules/Exporter/Shiny.pm b/src/modules/Exporter/Shiny.pm
new file mode 100644
index 0000000..7b7ae07
--- /dev/null
+++ b/src/modules/Exporter/Shiny.pm
@@ -0,0 +1,116 @@
+package Exporter::Shiny;
+
+use 5.006001;
+use strict;
+use warnings;
+
+use Exporter::Tiny ();
+
+our $AUTHORITY = 'cpan:TOBYINK';
+our $VERSION   = '1.004004';
+
+sub import {
+    my $me = shift;
+    my $caller = caller;
+
+    (my $nominal_file = $caller) =~ s(::)(/)g;
+    $INC{"$nominal_file\.pm"} ||= __FILE__;
+
+    if (@_ == 2 and $_[0] eq -setup)
+    {
+        my (undef, $opts) = @_;
+        @_ = @{ delete($opts->{exports}) || [] };
+
+        if (%$opts) {
+            Exporter::Tiny::_croak(
+                'Unsupported Sub::Exporter-style options: %s',
+                join(q[, ], sort keys %$opts),
+            );
+        }
+    }
+
+    ref($_) && Exporter::Tiny::_croak('Expected sub name, got ref %s', $_) for @_;
+
+    no strict qw(refs);
+    push @{"$caller\::ISA"}, 'Exporter::Tiny';
+    push @{"$caller\::EXPORT_OK"}, @_;
+}
+
+1;
+
+__END__
+
+=pod
+
+=encoding utf-8
+
+=head1 NAME
+
+Exporter::Shiny - shortcut for Exporter::Tiny
+
+=head1 SYNOPSIS
+
+   use Exporter::Shiny qw( foo bar );
+
+Is a shortcut for:
+
+   use base "Exporter::Tiny";
+   push our(@EXPORT_OK), qw( foo bar );
+
+For compatibility with L<Sub::Exporter>, the following longer syntax is
+also supported:
+
+   use Exporter::Shiny -setup => {
+      exports => [qw( foo bar )],
+   };
+
+=head1 DESCRIPTION
+
+This is a very small wrapper to simplify using L<Exporter::Tiny>.
+
+It does the following:
+
+=over
+
+=item * Marks your package as loaded in C<< %INC >>;
+
+=item * Pushes any function names in the import list onto your C<< @EXPORT_OK >>; and
+
+=item * Pushes C<< "Exporter::Tiny" >> onto your C<< @ISA >>.
+
+=back
+
+It doesn't set up C<< %EXPORT_TAGS >> or C<< @EXPORT >>, but there's
+nothing stopping you doing that yourself.
+
+=head1 BUGS
+
+Please report any bugs to
+L<http://rt.cpan.org/Dist/Display.html?Queue=Exporter-Tiny>.
+
+=head1 SEE ALSO
+
+This module is just a wrapper around L<Exporter::Tiny>, so take a look
+at L<Exporter::Tiny::Manual::QuickStart> and
+L<Exporter::Tiny::Manual::Exporting> for further information on what
+features are available.
+
+Other interesting exporters: L<Sub::Exporter>, L<Exporter>.
+
+=head1 AUTHOR
+
+Toby Inkster E<lt>tobyink@cpan.orgE<gt>.
+
+=head1 COPYRIGHT AND LICENCE
+
+This software is copyright (c) 2014, 2017 by Toby Inkster.
+
+This is free software; you can redistribute it and/or modify it under
+the same terms as the Perl 5 programming language system itself.
+
+=head1 DISCLAIMER OF WARRANTIES
+
+THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ diff --git a/src/modules/Exporter/Tiny.pm b/src/modules/Exporter/Tiny.pm new file mode 100644 index 0000000..3879a40 --- /dev/null +++ b/src/modules/Exporter/Tiny.pm @@ -0,0 +1,525 @@ +package Exporter::Tiny; + +use 5.006001; +use strict; +use warnings; no warnings qw(void once uninitialized numeric redefine); + +our $AUTHORITY = 'cpan:TOBYINK'; +our $VERSION = '1.004004'; +our @EXPORT_OK = qw< mkopt mkopt_hash _croak _carp >; + +sub _croak ($;@) { require Carp; my $fmt = shift; @_ = sprintf($fmt, @_); goto \&Carp::croak } +sub _carp ($;@) { require Carp; my $fmt = shift; @_ = sprintf($fmt, @_); goto \&Carp::carp } + +my $_process_optlist = sub +{ + my $class = shift; + my ($global_opts, $opts, $want, $not_want) = @_; + + while (@$opts) + { + my $opt = shift @{$opts}; + my ($name, $value) = @$opt; + + ($name =~ m{\A\!(/.+/[msixpodual]*)\z}) ? + do { + my @not = $class->_exporter_expand_regexp("$1", $value, $global_opts); + ++$not_want->{$_->[0]} for @not; + } : + ($name =~ m{\A\![:-](.+)\z}) ? + do { + my @not = $class->_exporter_expand_tag("$1", $value, $global_opts); + ++$not_want->{$_->[0]} for @not; + } : + ($name =~ m{\A\!(.+)\z}) ? + (++$not_want->{$1}) : + ($name =~ m{\A[:-](.+)\z}) ? + push(@$opts, $class->_exporter_expand_tag("$1", $value, $global_opts)) : + ($name =~ m{\A/.+/[msixpodual]*\z}) ? + push(@$opts, $class->_exporter_expand_regexp($name, $value, $global_opts)) : + # else ? + push(@$want, $opt); + } +}; + +sub import +{ + my $class = shift; + my $global_opts = +{ @_ && ref($_[0]) eq q(HASH) ? %{+shift} : () }; + $global_opts->{into} = caller unless exists $global_opts->{into}; + + my @want; + my %not_want; $global_opts->{not} = \%not_want; + my @args = do { no strict qw(refs); @_ ? @_ : @{"$class\::EXPORT"} }; + my $opts = mkopt(\@args); + $class->$_process_optlist($global_opts, $opts, \@want, \%not_want); + + my $permitted = $class->_exporter_permitted_regexp($global_opts); + $class->_exporter_validate_opts($global_opts); + + for my $wanted (@want) + { + next if $not_want{$wanted->[0]}; + + my %symbols = $class->_exporter_expand_sub(@$wanted, $global_opts, $permitted); + $class->_exporter_install_sub($_, $wanted->[1], $global_opts, $symbols{$_}) + for keys %symbols; + } +} + +sub unimport +{ + my $class = shift; + my $global_opts = +{ @_ && ref($_[0]) eq q(HASH) ? %{+shift} : () }; + $global_opts->{into} = caller unless exists $global_opts->{into}; + $global_opts->{is_unimport} = 1; + + my @want; + my %not_want; $global_opts->{not} = \%not_want; + my @args = do { our %TRACKED; @_ ? @_ : keys(%{$TRACKED{$class}{$global_opts->{into}}}) }; + my $opts = mkopt(\@args); + $class->$_process_optlist($global_opts, $opts, \@want, \%not_want); + + my $permitted = $class->_exporter_permitted_regexp($global_opts); + $class->_exporter_validate_unimport_opts($global_opts); + + my $expando = $class->can('_exporter_expand_sub'); + $expando = undef if $expando == \&_exporter_expand_sub; + + for my $wanted (@want) + { + next if $not_want{$wanted->[0]}; + + if ($wanted->[1]) + { + _carp("Passing options to unimport '%s' makes no sense", $wanted->[0]) + unless (ref($wanted->[1]) eq 'HASH' and not keys %{$wanted->[1]}); + } + + my %symbols = defined($expando) + ? $class->$expando(@$wanted, $global_opts, $permitted) + : ($wanted->[0] => sub { "dummy" }); + $class->_exporter_uninstall_sub($_, $wanted->[1], $global_opts) + for keys %symbols; + } +} + +# Called once per import/unimport, passed the "global" import options. 
+# Expected to validate the options and carp or croak if there are problems. +# Can also take the opportunity to do other stuff if needed. +# +sub _exporter_validate_opts { 1 } +sub _exporter_validate_unimport_opts { 1 } + +# Called after expanding a tag or regexp to merge the tag's options with +# any sub-specific options. +# +sub _exporter_merge_opts +{ + my $class = shift; + my ($tag_opts, $global_opts, @stuff) = @_; + + $tag_opts = {} unless ref($tag_opts) eq q(HASH); + _croak('Cannot provide an -as option for tags') + if exists $tag_opts->{-as} && ref $tag_opts->{-as} ne 'CODE'; + + my $optlist = mkopt(\@stuff); + for my $export (@$optlist) + { + next if defined($export->[1]) && ref($export->[1]) ne q(HASH); + + my %sub_opts = ( %{ $export->[1] or {} }, %$tag_opts ); + $sub_opts{-prefix} = sprintf('%s%s', $tag_opts->{-prefix}, $export->[1]{-prefix}) + if exists($export->[1]{-prefix}) && exists($tag_opts->{-prefix}); + $sub_opts{-suffix} = sprintf('%s%s', $export->[1]{-suffix}, $tag_opts->{-suffix}) + if exists($export->[1]{-suffix}) && exists($tag_opts->{-suffix}); + $export->[1] = \%sub_opts; + } + return @$optlist; +} + +# Given a tag name, looks it up in %EXPORT_TAGS and returns the list of +# associated functions. The default implementation magically handles tags +# "all" and "default". The default implementation interprets any undefined +# tags as being global options. +# +sub _exporter_expand_tag +{ + no strict qw(refs); + + my $class = shift; + my ($name, $value, $globals) = @_; + my $tags = \%{"$class\::EXPORT_TAGS"}; + + return $class->_exporter_merge_opts($value, $globals, $tags->{$name}->($class, @_)) + if ref($tags->{$name}) eq q(CODE); + + return $class->_exporter_merge_opts($value, $globals, @{$tags->{$name}}) + if exists $tags->{$name}; + + return $class->_exporter_merge_opts($value, $globals, @{"$class\::EXPORT"}, @{"$class\::EXPORT_OK"}) + if $name eq 'all'; + + return $class->_exporter_merge_opts($value, $globals, @{"$class\::EXPORT"}) + if $name eq 'default'; + + $globals->{$name} = $value || 1; + return; +} + +# Given a regexp-like string, looks it up in @EXPORT_OK and returns the +# list of matching functions. +# +sub _exporter_expand_regexp +{ + no strict qw(refs); + our %TRACKED; + + my $class = shift; + my ($name, $value, $globals) = @_; + my $compiled = eval("qr$name"); + + my @possible = $globals->{is_unimport} + ? keys( %{$TRACKED{$class}{$globals->{into}}} ) + : @{"$class\::EXPORT_OK"}; + + $class->_exporter_merge_opts($value, $globals, grep /$compiled/, @possible); +} + +# Helper for _exporter_expand_sub. Returns a regexp matching all subs in +# the exporter package which are available for export. +# +sub _exporter_permitted_regexp +{ + no strict qw(refs); + my $class = shift; + my $re = join "|", map quotemeta, sort { + length($b) <=> length($a) or $a cmp $b + } @{"$class\::EXPORT"}, @{"$class\::EXPORT_OK"}; + qr{^(?:$re)$}ms; +} + +# Given a sub name, returns a hash of subs to install (usually just one sub). +# Keys are sub names, values are coderefs. +# +sub _exporter_expand_sub +{ + my $class = shift; + my ($name, $value, $globals, $permitted) = @_; + $permitted ||= $class->_exporter_permitted_regexp($globals); + + no strict qw(refs); + + my $sigil = "&"; + if ($name =~ /\A([&\$\%\@\*])(.+)\z/) { + $sigil = $1; + $name = $2; + if ($sigil eq '*') { + _croak("Cannot export symbols with a * sigil"); + } + } + my $sigilname = $sigil eq '&' ? 
$name : "$sigil$name"; + + if ($sigilname =~ $permitted) + { + my $generatorprefix = { + '&' => "_generate_", + '$' => "_generateScalar_", + '@' => "_generateArray_", + '%' => "_generateHash_", + }->{$sigil}; + + my $generator = $class->can("$generatorprefix$name"); + return $sigilname => $class->$generator($sigilname, $value, $globals) if $generator; + + if ($sigil eq '&') { + my $sub = $class->can($name); + return $sigilname => $sub if $sub; + } + else { + # Could do this more cleverly, but this works. + my $evalled = eval "\\${sigil}${class}::${name}"; + return $sigilname => $evalled if $evalled; + } + } + + $class->_exporter_fail(@_); +} + +# Called by _exporter_expand_sub if it is unable to generate a key-value +# pair for a sub. +# +sub _exporter_fail +{ + my $class = shift; + my ($name, $value, $globals) = @_; + return if $globals->{is_unimport}; + _croak("Could not find sub '%s' exported by %s", $name, $class); +} + +# Actually performs the installation of the sub into the target package. This +# also handles renaming the sub. +# +sub _exporter_install_sub +{ + my $class = shift; + my ($name, $value, $globals, $sym) = @_; + my $value_hash = ( ref($value) eq 'HASH' ) ? $value : {}; + + my $into = $globals->{into}; + my $installer = $globals->{installer} || $globals->{exporter}; + + if ( $into eq '-lexical' or $globals->{lexical} ) { + $] ge '5.037002' + or _croak( 'Lexical export requires Perl 5.37.2 or above' ); + $installer ||= sub { + my ( $sigilname, $sym ) = @{ $_[1] }; + no warnings ( $] ge '5.037002' ? 'experimental::builtin' : () ); + builtin::export_lexically( $sigilname, $sym ); + }; + } + + $name = + ref $globals->{as} ? $globals->{as}->($name) : + ref $value_hash->{-as} ? $value_hash->{-as}->($name) : + exists $value_hash->{-as} ? $value_hash->{-as} : + $name; + + return unless defined $name; + + my $sigil = "&"; + unless (ref($name)) { + if ($name =~ /\A([&\$\%\@\*])(.+)\z/) { + $sigil = $1; + $name = $2; + if ($sigil eq '*') { + _croak("Cannot export symbols with a * sigil"); + } + } + my ($prefix) = grep defined, $value_hash->{-prefix}, $globals->{prefix}, q(); + my ($suffix) = grep defined, $value_hash->{-suffix}, $globals->{suffix}, q(); + $name = "$prefix$name$suffix"; + } + + my $sigilname = $sigil eq '&' ? $name : "$sigil$name"; + +# if ({qw/$ SCALAR @ ARRAY % HASH & CODE/}->{$sigil} ne ref($sym)) { +# warn $sym; +# warn $sigilname; +# _croak("Reference type %s does not match sigil %s", ref($sym), $sigil); +# } + + return ($$name = $sym) if ref($name) eq q(SCALAR); + return ($into->{$sigilname} = $sym) if ref($into) eq q(HASH); + + no strict qw(refs); + our %TRACKED; + + if (ref($sym) eq 'CODE' and exists &{"$into\::$name"} and \&{"$into\::$name"} != $sym) + { + my ($level) = grep defined, $value_hash->{-replace}, $globals->{replace}, q(0); + my $action = { + carp => \&_carp, + 0 => \&_carp, + '' => \&_carp, + warn => \&_carp, + nonfatal => \&_carp, + croak => \&_croak, + fatal => \&_croak, + die => \&_croak, + }->{$level} || sub {}; + + # Don't complain about double-installing the same sub. This isn't ideal + # because the same named sub might be generated in two different ways. + $action = sub {} if $TRACKED{$class}{$into}{$sigilname}; + + $action->( + $action == \&_croak + ? "Refusing to overwrite existing sub '%s::%s' with sub '%s' exported by %s" + : "Overwriting existing sub '%s::%s' with sub '%s' exported by %s", + $into, + $name, + $_[0], + $class, + ); + } + + $TRACKED{$class}{$into}{$sigilname} = $sym; + + no warnings qw(prototype); + $installer + ? 
$installer->($globals, [$sigilname, $sym]) + : (*{"$into\::$name"} = $sym); +} + +sub _exporter_uninstall_sub +{ + our %TRACKED; + my $class = shift; + my ($name, $value, $globals, $sym) = @_; + my $into = $globals->{into}; + ref $into and return; + + no strict qw(refs); + + my $sigil = "&"; + if ($name =~ /\A([&\$\%\@\*])(.+)\z/) { + $sigil = $1; + $name = $2; + if ($sigil eq '*') { + _croak("Cannot export symbols with a * sigil"); + } + } + my $sigilname = $sigil eq '&' ? $name : "$sigil$name"; + + if ($sigil ne '&') { + _croak("Unimporting non-code symbols not supported yet"); + } + + # Cowardly refuse to uninstall a sub that differs from the one + # we installed! + my $our_coderef = $TRACKED{$class}{$into}{$name}; + my $cur_coderef = exists(&{"$into\::$name"}) ? \&{"$into\::$name"} : -1; + return unless $our_coderef == $cur_coderef; + + my $stash = \%{"$into\::"}; + my $old = delete $stash->{$name}; + my $full_name = join('::', $into, $name); + foreach my $type (qw(SCALAR HASH ARRAY IO)) # everything but the CODE + { + next unless defined(*{$old}{$type}); + *$full_name = *{$old}{$type}; + } + + delete $TRACKED{$class}{$into}{$name}; +} + +sub mkopt +{ + my $in = shift or return []; + my @out; + + $in = [map(($_ => ref($in->{$_}) ? $in->{$_} : ()), sort keys %$in)] + if ref($in) eq q(HASH); + + for (my $i = 0; $i < @$in; $i++) + { + my $k = $in->[$i]; + my $v; + + ($i == $#$in) ? ($v = undef) : + !defined($in->[$i+1]) ? (++$i, ($v = undef)) : + !ref($in->[$i+1]) ? ($v = undef) : + ($v = $in->[++$i]); + + push @out, [ $k => $v ]; + } + + \@out; +} + +sub mkopt_hash +{ + my $in = shift or return; + my %out = map +($_->[0] => $_->[1]), @{ mkopt($in) }; + \%out; +} + +1; + +__END__ + +=pod + +=encoding utf-8 + +=for stopwords frobnicate greps regexps + +=head1 NAME + +Exporter::Tiny - an exporter with the features of Sub::Exporter but only core dependencies + +=head1 SYNOPSIS + + package MyUtils; + use base "Exporter::Tiny"; + our @EXPORT = qw(frobnicate); + sub frobnicate { ... } + 1; + + package MyScript; + use MyUtils "frobnicate" => { -as => "frob" }; + print frob(42); + exit; + +=head1 DESCRIPTION + +Exporter::Tiny supports many of Sub::Exporter's external-facing features +including renaming imported functions with the C<< -as >>, C<< -prefix >> and +C<< -suffix >> options; explicit destinations with the C<< into >> option; +and alternative installers with the C<< installer >> option. But it's written +in only about 40% as many lines of code and with zero non-core dependencies. + +Its internal-facing interface is closer to Exporter.pm, with configuration +done through the C<< @EXPORT >>, C<< @EXPORT_OK >> and C<< %EXPORT_TAGS >> +package variables. + +If you are trying to B a module that inherits from Exporter::Tiny, +then look at: + +=over + +=item * + +L + +=item * + +L + +=back + +If you are trying to B a module that inherits from Exporter::Tiny, +then look at: + +=over + +=item * + +L + +=back + +=head1 BUGS + +Please report any bugs to +L. + +=head1 SUPPORT + +B<< IRC: >> support is available through in the I<< #moops >> channel +on L. + +=head1 SEE ALSO + +Simplified interface to this module: L. + +Other interesting exporters: L, L. + +=head1 AUTHOR + +Toby Inkster Etobyink@cpan.orgE. + +=head1 COPYRIGHT AND LICENCE + +This software is copyright (c) 2013-2014, 2017, 2022 by Toby Inkster. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. 
+ +=head1 DISCLAIMER OF WARRANTIES + +THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + diff --git a/src/modules/Exporter/Tiny/Manual/Etc.pod b/src/modules/Exporter/Tiny/Manual/Etc.pod new file mode 100644 index 0000000..6da7615 --- /dev/null +++ b/src/modules/Exporter/Tiny/Manual/Etc.pod @@ -0,0 +1,134 @@ +=pod + +=encoding utf-8 + +=for stopwords frobnicate greps regexps + +=head1 NAME + +Exporter::Tiny::Manual::Etc - odds and ends + +=head1 DESCRIPTION + +=head2 Utility Functions + +Exporter::Tiny is itself an exporter! + +These functions are really for internal use, but can be exported if you +need them: + +=over + +=item C<< mkopt(\@array) >> + +Similar to C from L. It doesn't support all the +fancy options that Data::OptList does (C, C, +C and C) but runs about 50% faster. + +=item C<< mkopt_hash(\@array) >> + +Similar to C from L. See also C. + +=back + +=head2 History + +L had a bunch of custom exporting code which poked coderefs +into its caller's stash. It needed this to be something more powerful than +most exporters so that it could switch between exporting Moose, Mouse and +Moo-compatible objects on request. L would have been capable, +but had too many dependencies for the Type::Tiny project. + +Meanwhile L, L and L each +used the venerable L. However, this meant they were +unable to use the features like L-style function renaming +which I'd built into Type::Library: + + ## import "Str" but rename it to "String". + use Types::Standard "Str" => { -as => "String" }; + +And so I decided to factor out code that could be shared by all Type-Tiny's +exporters into a single place: Exporter::TypeTiny. + +As of version 0.026, Exporter::TypeTiny was also made available as +L, distributed independently on CPAN. CHOCOLATEBOY had +convinced me that it was mature enough to live a life of its own. + +As of version 0.030, Type-Tiny depends on Exporter::Tiny and +Exporter::TypeTiny is being phased out. + +=head2 Obligatory Exporter Comparison + +Exporting is unlikely to be your application's performance bottleneck, but +nonetheless here are some comparisons. + +B<< Comparative sizes according to L: >> + + Exporter 217.1Kb + Sub::Exporter::Progressive 263.2Kb + Exporter::Tiny 267.7Kb + Exporter + Exporter::Heavy 281.5Kb + Exporter::Renaming 406.2Kb + Sub::Exporter 701.0Kb + +B<< Performance exporting a single sub: >> + + Rate SubExp ExpTiny SubExpProg ExpPM +SubExp 2489/s -- -56% -85% -88% +ExpTiny 5635/s 126% -- -67% -72% +SubExpProg 16905/s 579% 200% -- -16% +ExpPM 20097/s 707% 257% 19% -- + +(Exporter::Renaming globally changes the behaviour of Exporter.pm, so could +not be included in the same benchmarks.) + +B<< (Non-Core) Dependencies: >> + + Exporter -1 + Exporter::Renaming 0 + Exporter::Tiny 0 + Sub::Exporter::Progressive 0 + Sub::Exporter 3 + +B<< Features: >> + + ExpPM ExpTiny SubExp SubExpProg + Can export code symbols............. Yes Yes Yes Yes + Can export non-code symbols......... Yes Yes + Groups/tags......................... Yes Yes Yes Yes + Export by regexp.................... Yes Yes + Bang prefix......................... Yes Yes + Allows renaming of subs............. Yes Yes Maybe + Install code into scalar refs....... Yes Yes Maybe + Can be passed an "into" parameter... Yes Yes Maybe + Can be passed an "installer" sub.... Yes Yes Maybe + Config avoids package variables..... Yes + Supports generators................. 
Yes Yes + Sane API for generators............. Yes Yes + Unimport............................ Yes + +(Certain Sub::Exporter::Progressive features are only available if +Sub::Exporter is installed.) + +=head1 SEE ALSO + +L, +L. + +=head1 AUTHOR + +Toby Inkster Etobyink@cpan.orgE. + +=head1 COPYRIGHT AND LICENCE + +This software is copyright (c) 2013-2014, 2017 by Toby Inkster. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. + +=head1 DISCLAIMER OF WARRANTIES + +THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + diff --git a/src/modules/Exporter/Tiny/Manual/Exporting.pod b/src/modules/Exporter/Tiny/Manual/Exporting.pod new file mode 100644 index 0000000..d0db61a --- /dev/null +++ b/src/modules/Exporter/Tiny/Manual/Exporting.pod @@ -0,0 +1,266 @@ +=pod + +=encoding utf-8 + +=for stopwords frobnicate greps regexps + +=head1 NAME + +Exporter::Tiny::Manual::Exporting - creating an exporter using Exporter::Tiny + +=head1 SYNOPSIS + +B<< Read L first! >> + +=head1 DESCRIPTION + +Simple configuration works the same as L; inherit from +L, and use the C<< @EXPORT >>, C<< @EXPORT_OK >>, +and C<< %EXPORT_TAGS >> package variables to list subs to export. + +Unlike Exporter, Exporter::Tiny performs most of its internal duties +(including resolution of tag names to sub names, resolution of sub +names to coderefs, and installation of coderefs into the target +package) as B, which means that your module (which is a +subclass of Exporter::Tiny) can override them to provide interesting +behaviour. + +=head2 Advanced Tag Stuff + +You can define tags using other tags: + + use Exporter::Shiny qw( + black white red green blue cyan magenta yellow + ); + + our %EXPORT_TAGS = ( + rgb => [qw( red green blue )], + cym => [qw( cyan magenta yellow )], + cymk => [qw( black :cym )], + monochrome => [qw( black white )], + all => [qw( :rgb :cymk :monochrome )], + ); + +B<< CAVEAT: >> If you create a cycle in the tags, this could put +Exporter::Tiny into an infinite loop expanding the tags. Don't do that. + +=head2 More on Generators + +Exporter::Tiny has always allowed exported subs to be generated (like +L), but until version 0.025 did not have an especially nice +API for it. + +Now, it's easy. If you want to generate a sub C to export, list it in +C<< @EXPORT >> or C<< @EXPORT_OK >> as usual, and then simply give your +exporter module a class method called C<< _generate_foo >>. + + push @EXPORT_OK, 'foo'; + + sub _generate_foo { + my $class = shift; + my ($name, $args, $globals) = @_; + + return sub { + ...; + } + } + +We showed how to do that in L, but +one thing we didn't show was that C<< $globals >> gets passed in there. +This is the global options hash, as described in +L. It can often be useful. In +particular it will tell you what package the generated sub is destined +to be installed into. + +To generate non-code symbols, name your generators like this: + + sub _generateScalar_Foo { ... } # generate a symbol $Foo + sub _generateArray_Bar { ... } # generate a symbol @Bar + sub _generateHash_Baz { ... 
} # generate a symbol %Baz + +You can also generate tags: + + my %constants; + BEGIN { + %constants = (FOO => 1, BAR => 2); + } + use constant \%constants; + + $EXPORT_TAGS{constants} = sub { + my $class = shift; + my ($name, $args, $globals) = @_; + + return keys(%constants); + }; + +=head2 Hooks + +Sometimes as well as exporting stuff, you want to do some setup or +something. + +You can define a couple of class methods in your package, and they'll +get called at the appropriate time: + + package MyUtils; + + ...; + + sub _exporter_validate_opts { + my $class = shift; + my ($globals) = @_; + + ...; # do stuff here + + $class->SUPER::_exporter_validate_opts(@_); + } + + sub _exporter_validate_unimport_opts { + my $class = shift; + my ($globals) = @_; + + ...; # do stuff here + + $class->SUPER::_exporter_validate_unimport_opts(@_); + } + +The C<< $globals >> variable is that famous global options hash. In +particular, C<< $globals->{into} >> is useful because it tells you what +package has imported you. + +As you might have guessed, these methods were originally intended to +validate the global options hash, but can be used to perform any +general duties before the real exporting work is done. + +=head2 Overriding Internals + +An important difference between L and Exporter::Tiny is that +the latter calls all its internal functions as I<< class methods >>. This +means that your subclass can I<< override them >> to alter their behaviour. + +The following methods are available to be overridden. Despite being named +with a leading underscore, they are considered public methods. (The underscore +is there to avoid accidentally colliding with any of your own function names.) + +=over + +=item C<< _exporter_validate_opts($globals) >> + +Documented above. + +=item C<< _exporter_validate_unimport_opts($globals) >> + +Documented above. + +=item C<< _exporter_merge_opts($tag_opts, $globals, @exports) >> + +Called to merge options which have been provided for a tag into the +options provided for the exports that the tag expanded to. + +=item C<< _exporter_expand_tag($name, $args, $globals) >> + +This method is called to expand an import tag (e.g. C<< ":constants" >>). +It is passed the tag name (minus the leading ":"), an optional hashref +of options (like C<< { -prefix => "foo_" } >>), and the global options +hashref. + +It is expected to return a list of ($name, $args) arrayref pairs. These +names can be sub names to export, or further tag names (which must have +their ":"). If returning tag names, be careful to avoid creating a tag +expansion loop! + +The default implementation uses C<< %EXPORT_TAGS >> to expand tags, and +provides fallbacks for the C<< :default >> and C<< :all >> tags. + +=item C<< _exporter_expand_regexp($regexp, $args, $globals) >> + +Like C<_exporter_expand_regexp>, but given a regexp-like string instead +of a tag name. + +The default implementation greps through C<< @EXPORT_OK >> for imports, +and the list of already-imported functions for exports. + +=item C<< _exporter_expand_sub($name, $args, $globals) >> + +This method is called to translate a sub name to a hash of name => coderef +pairs for exporting to the caller. 
In general, this would just be a hash with +one key and one value, but, for example, L overrides this +method so that C<< "+Foo" >> gets expanded to: + + ( + Foo => sub { $type }, + is_Foo => sub { $type->check(@_) }, + to_Foo => sub { $type->assert_coerce(@_) }, + assert_Foo => sub { $type->assert_return(@_) }, + ) + +The default implementation checks that the name is allowed to be exported +(using the C<_exporter_permitted_regexp> method), gets the coderef using +the generator if there is one (or by calling C<< can >> on your exporter +otherwise) and calls C<_exporter_fail> if it's unable to generate or +retrieve a coderef. + +Despite the name, is also called for non-code symbols. + +=item C<< _exporter_permitted_regexp($globals) >> + +This method is called to retrieve a regexp for validating the names of +exportable subs. If a sub doesn't match the regexp, then the default +implementation of C<_exporter_expand_sub> will refuse to export it. (Of +course, you may override the default C<_exporter_expand_sub>.) + +The default implementation of this method assembles the regexp from +C<< @EXPORT >> and C<< @EXPORT_OK >>. + +=item C<< _exporter_fail($name, $args, $globals) >> + +Called by C<_exporter_expand_sub> if it can't find a coderef to export. + +The default implementation just throws an exception. But you could emit +a warning instead, or just ignore the failed export. + +If you don't throw an exception then you should be aware that this +method is called in list context, and any list it returns will be treated +as an C<_exporter_expand_sub>-style hash of names and coderefs for +export. + +=item C<< _exporter_install_sub($name, $args, $globals, $coderef) >> + +This method actually installs the exported sub into its new destination. +Its return value is ignored. + +The default implementation handles sub renaming (i.e. the C<< -as >>, +C<< -prefix >> and C<< -suffix >> functions. This method does a lot of +stuff; if you need to override it, it's probably a good idea to just +pre-process the arguments and then call the super method rather than +trying to handle all of it yourself. + +Despite the name, is also called for non-code symbols. + +=item C<< _exporter_uninstall_sub($name, $args, $globals) >> + +The opposite of C<_exporter_install_sub>. + +=back + +=head1 SEE ALSO + +L, +L. + +=head1 AUTHOR + +Toby Inkster Etobyink@cpan.orgE. + +=head1 COPYRIGHT AND LICENCE + +This software is copyright (c) 2013-2014, 2017 by Toby Inkster. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. + +=head1 DISCLAIMER OF WARRANTIES + +THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + diff --git a/src/modules/Exporter/Tiny/Manual/Importing.pod b/src/modules/Exporter/Tiny/Manual/Importing.pod new file mode 100644 index 0000000..336f25c --- /dev/null +++ b/src/modules/Exporter/Tiny/Manual/Importing.pod @@ -0,0 +1,269 @@ +=pod + +=encoding utf-8 + +=for stopwords frobnicate greps regexps + +=head1 NAME + +Exporter::Tiny::Manual::Importing - importing from Exporter::Tiny-based modules + +=head1 DESCRIPTION + +For the purposes of this discussion we'll assume we have a module called +C<< MyUtils >> which exports functions called C, C, +C, and C. It has a tag set up called C<:colours> which +corresponds to C, C, and C. + +Many of these tricks may seem familiar from L. 
That is +intentional. Exporter::Tiny doesn't attempt to provide every feature of +Sub::Exporter, but where it does it usually uses a fairly similar API. + +=head2 Basic importing + +It's easy to import a single function from a module: + + use MyUtils "frobnicate"; + +Or a list of functions: + + use MyUtils "red", "green"; + +Perl's C<< qw() >> shorthand for a list of words is pretty useful: + + use MyUtils qw( red green ); + +If the module defines tags, you can import them like this: + + use MyUtils qw( :colours ); + +Or with a hyphen instead of a colon: + + use MyUtils qw( -colours ); + +Hyphens are good because Perl will autoquote a bareword that follows +them: + + use MyUtils -colours; + +And it's possible to mix function names and tags in the same list: + + use MyUtils qw( frobnicate :colours ); + +=head2 Renaming imported functions + +It's possible to rename a function you're importing: + + use MyUtils "frobnicate" => { -as => "frob" }; + +Or you can apply a prefix and/or suffix. The following imports the +function and calls it C. + + use MyUtils "frobnicate" => { -prefix => "my_", -suffix => "_thing" }; + +You can apply a prefix/suffix to B functions you import by +placing the hashref B in the import list. (This first hashref +is referred to as the global options hash, and can do some special +things.) + + use MyUtils { prefix => "my_" }, "frobnicate"; + +Did you notice that we used C<< -prefix >> and C<< -suffix >> in the +normal options hash, but C<< prefix >> and C<< suffix >> (no hyphen) +in the global options hash? That's a common pattern with this module. + +You can import the same function multiple times with different names: + + use MyUtils + "frobnicate" => { -as => "frob" }, + "frobnicate" => { -as => "frbnct" }; + +Tags can take the C<< -prefix >> and C<< -suffix >> options too. The +following imports C, C, and C: + + use MyUtils -colours => { -prefix => "colour_" }; + +You can also set C<< -as >> to be a coderef to generate a function +name. This imports functions called C, C, and C: + + use MyUtils -colours => { -as => sub { uc($_[0]) } }; + +Note that it doesn't make sense to use C<< -as >> with a tag unless +you're doing this coderef thing. Coderef C<< as >> also works in the +global options hash. + +=head2 DO NOT WANT! + +Sometimes you want to supply a list of functions you B<< don't >> want +to import. To do that, prefix the function with a bang. This imports +everything except "frobnicate": + + use MyUtils qw( -all !frobnicate ); + +You can add the bang prefix to tags too. This will import everything +except the colours. + + use MyUtils qw( -all !:colours ); + +Negated imports always "win", so the following will not import +"frobnicate", no matter how many times you repeat it... + + use MyUtils qw( !frobnicate frobnicate frobnicate frobnicate ); + +=head2 Importing by regexp + +Here's how you could import all functions beginning with an "f": + + use MyUtils qw( /^F/i ); + +Or import everything except functions beginning with a "z": + + use MyUtils qw( -all !/^Z/i ); + +Note that regexps are always supplied as I starting with +C<< "/" >>, and not as quoted regexp references (C<< qr/.../ >>). + +=head2 Import functions into another package + +Occasionally you need to import functions not into your own package, +but into a different package. 
You can do that like this: + + use MyUtils { into => "OtherPkg" }, "frobnicate"; + + OtherPkg::frobincate(...); + +However, L will probably provide you with a better +approach which doesn't just work with Exporter::Tiny, but B +exporters. + +=head2 Lexical subs on Perl 5.37.2 and above + +Often you want to make use of an exported function, but don't want +it to "pollute" your namespace. + +On newer versions of Perl, Exporter::Tiny can use C +from L to give you lexical versions of exports. + + { + use MyUtils -lexical, "frobnicate"; + + frobnicate(...); # ok + } + + frobnicate(...); # not ok + +This functionality should be considered B until +C is included in a stable release of Perl. + +=head2 Lexical subs on Perl older than 5.37.2 + +There is this L thing that was designed as a +plugin for L, but Exporter::Tiny's API is close enough +that it will work. Do you remember that global options hash? Just +use that to tell Exporter::Tiny to use an alternative sub installer. + + { + use Sub::Exporter::Lexical lexical_installer => { -as => "lex" }; + use MyUtils { installer => lex }, "frobnicate"; + + frobnicate(...); # ok + } + + frobnicate(...); # not ok + +Another way to do lexical functions is to import a function into a +scalar variable: + + my $func; + use MyUtils "frobnicate" => { -as => \$func }; + + $func->(...); + +You can even provide a hashref to put all imported functions into as +part of that global options hash I mentioned earlier. + + my %funcs; + use MyUtils { into => \%funcs }, "frobnicate"; + + $funcs{frobnicate}->(...); + +=head2 Unimporting + +You can unimport the functions that MyUtils added to your namespace: + + no MyUtils; + +Or just specific ones: + + no MyUtils qw(frobnicate); + +If you renamed a function when you imported it, you should unimport by +the new name: + + use MyUtils frobnicate => { -as => "frob" }; + ...; + no MyUtils "frob"; + +Unimporting using tags and regexps should mostly do what you want. + +=head1 DIAGNOSTICS + +=over + +=item B<< Overwriting existing sub '%s::%s' with sub '%s' exported by %s >> + +A warning issued if Exporter::Tiny is asked to export a symbol which +will result in an existing sub being overwritten. This warning can be +suppressed using either of the following: + + use MyUtils { replace => 1 }, "frobnicate"; + use MyUtils "frobnicate" => { -replace => 1 }; + +Or can be upgraded to a fatal error: + + use MyUtils { replace => "die" }, "frobnicate"; + use MyUtils "frobnicate" => { -replace => "die" }; + +=item B<< Refusing to overwrite existing sub '%s::%s' with sub '%s' exported by %s >> + +The fatal version of the above warning. + +=item B<< Could not find sub '%s' exported by %s >> + +You requested to import a sub which the package does not provide. + +=item B<< Cannot provide an -as option for tags >> + +Because a tag may provide more than one function, it does not make sense +to request a single name for it. Instead use C<< -prefix >> or C<< -suffix >>. + +=item B<< Passing options to unimport '%s' makes no sense >> + +When you import a sub, it occasionally makes sense to pass some options +for it. However, when unimporting, options do nothing, so this warning +is issued. + +=back + +=head1 SEE ALSO + +L, +L. + +=head1 AUTHOR + +Toby Inkster Etobyink@cpan.orgE. + +=head1 COPYRIGHT AND LICENCE + +This software is copyright (c) 2013-2014, 2017 by Toby Inkster. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. 
+ +=head1 DISCLAIMER OF WARRANTIES + +THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + diff --git a/src/modules/Exporter/Tiny/Manual/QuickStart.pod b/src/modules/Exporter/Tiny/Manual/QuickStart.pod new file mode 100644 index 0000000..13335be --- /dev/null +++ b/src/modules/Exporter/Tiny/Manual/QuickStart.pod @@ -0,0 +1,195 @@ +=pod + +=encoding utf-8 + +=for stopwords frobnicate greps regexps + +=head1 NAME + +Exporter::Tiny::Manual::QuickStart - the quickest way to get up and running with Exporter::Tiny + +=head1 SYNOPSIS + + package MyUtils; + + use Exporter::Shiny qw( frobnicate ); + + sub frobnicate { + ...; # your code here + } + + 1; + +Now people can use your module like this: + + use MyUtils "frobnicate"; + + frobnicate(42); + +Or like this: + + use MyUtils "frobnicate" => { -as => "frob" }; + + frob(42); + +=head1 DESCRIPTION + +See the synopsis. Yes, it's that simple. + +=head2 Next steps + +=head3 Default exports + +Note that the module in the synopsis doesn't export anything by default. +If people load C like this: + + use MyUtils; + +Then they haven't imported any functions. You can specify a default set +of functions to be exported like this: + + package MyUtils; + + use Exporter::Shiny qw( frobnicate ); + + our @EXPORT = qw( frobnicate ); + + sub frobnicate { ... } + + 1; + +Or, if you want to be a superstar rock god: + + package MyUtils; + + use Exporter::Shiny our @EXPORT = qw( frobnicate ); + + sub frobnicate { ... } + + 1; + +=head3 Tags + +You can provide tags for people to use: + + package MyUtils; + + use Exporter::Shiny qw( frobnicate red green blue ); + + our %EXPORT_TAGS = ( + utils => [qw/ frobnicate /], + colours => [qw/ red green blue /], + ); + + sub frobnicate { ... } + sub red { ... } + sub green { ... } + sub blue { ... } + + 1; + +And people can now import your functions like this: + + use MyUtils ":colours"; + +Or this: + + use MyUtils "-colours"; + +Or take advantage of the fact that Perl magically quotes barewords +preceded by a hyphen: + + use MyUtils -colours; + +Two tags are automatically defined for you: C<< -default >> (which is +just the same as C<< @EXPORT >>) and C<< -all >> (which is the union of +C<< @EXPORT >> and C<< @EXPORT_OK >>). If you don't like them, then you +can override them: + + our %EXPORT_TAGS = ( + default => \@some_other_stuff, + all => \@more_stuff, + ); + +=head3 Generators + +Exporting normally just works by copying a sub from your package into +your caller's package. But sometimes it's useful instead to generate +a I sub to insert into your caller's package. This is pretty +easy to do. + + package MyUtils; + + use Exporter::Shiny qw( frobnicate ); + + sub _generate_frobnicate { + my ( $me, $name, $args, $globals ) = @_; + my $caller = $globals->{into}; + + return sub { + ...; # your code here + }; + } + + 1; + +The parameter C<< $me >> here is a string containing the package name +which is being imported from; C<< $caller >> is the destination package; +C<< $name >> is the name of the sub (in this case "frobnicate"); and +C<< $args >> is a custom argument for this function. (By convention, +C<< $args >> is normally a hashref.) + + # The hashref { foo => 42 } is $args above. + # + use MyUtils "frobnicate" => { foo => 42 }; + +=head2 Avoiding Exporter::Shiny + +Exporter::Shiny is a tiny shim around Exporter::Tiny. 
It should mostly +do what you want, but you may sometimes prefer to use Exporter::Tiny +directly. + +The example in the synopsis could have been written as: + + package MyUtils; + + use parent "Exporter::Tiny"; + our @EXPORT_OK = qw( frobnicate ); + + sub frobnicate { + ...; # your code here + } + + 1; + +What Exporter::Shiny does is mostly just to set C<< @EXPORT_OK >> for +you and set up inheritance from the base class (Exporter::Tiny). + +Exporter::Shiny also sets C<< $INC{'MyUtils.pm'} >> for you, which in +usually makes little difference, but is useful in some edge cases. + +=head1 SEE ALSO + +L, +L. + +For more advanced information, see +L. + +=head1 AUTHOR + +Toby Inkster Etobyink@cpan.orgE. + +=head1 COPYRIGHT AND LICENCE + +This software is copyright (c) 2013-2014, 2017 by Toby Inkster. + +This is free software; you can redistribute it and/or modify it under +the same terms as the Perl 5 programming language system itself. + +=head1 DISCLAIMER OF WARRANTIES + +THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + diff --git a/src/modules/List/MoreUtils.pm b/src/modules/List/MoreUtils.pm new file mode 100644 index 0000000..05152fd --- /dev/null +++ b/src/modules/List/MoreUtils.pm @@ -0,0 +1,1286 @@ +package List::MoreUtils; + +use 5.008_001; +use strict; +use warnings; + +my $have_xs; +our $VERSION = '0.430'; + +BEGIN +{ + unless (defined($have_xs)) + { + ## no critic (ErrorHandling::RequireCheckingReturnValueOfEval) + eval { require List::MoreUtils::XS; } unless $ENV{LIST_MOREUTILS_PP}; + ## no critic (ErrorHandling::RequireCarping) + die $@ if $@ && defined $ENV{LIST_MOREUTILS_PP} && $ENV{LIST_MOREUTILS_PP} == 0; + $have_xs = 0 + defined($INC{'List/MoreUtils/XS.pm'}); + } + + use List::MoreUtils::PP qw(); +} + +use Exporter::Tiny qw(); + +my @junctions = qw(any all none notall); +my @v0_22 = qw( + true false + firstidx lastidx + insert_after insert_after_string + apply indexes + after after_incl before before_incl + firstval lastval + each_array each_arrayref + pairwise natatime + mesh uniq + minmax part + _XScompiled +); +my @v0_24 = qw(bsearch); +my @v0_33 = qw(sort_by nsort_by); +my @v0_400 = qw(one any_u all_u none_u notall_u one_u + firstres onlyidx onlyval onlyres lastres + singleton bsearchidx +); +my @v0_420 = qw(arrayify duplicates minmaxstr samples zip6 reduce_0 reduce_1 reduce_u + listcmp frequency occurrences mode + binsert bremove equal_range lower_bound upper_bound qsort + slide slideatatime); + +my @all_functions = (@junctions, @v0_22, @v0_24, @v0_33, @v0_400, @v0_420); + +## no critic (TestingAndDebugging::ProhibitNoStrict) +no strict "refs"; +if ($have_xs) +{ + my $x; + for (@all_functions) + { + List::MoreUtils->can($_) or *$_ = $x if ($x = List::MoreUtils::XS->can($_)); + } +} +List::MoreUtils->can($_) or *$_ = List::MoreUtils::PP->can($_) for (@all_functions); +use strict; +## use critic (TestingAndDebugging::ProhibitNoStrict) +use parent qw(Exporter::Tiny); + +my %alias_list = ( + v0_22 => { + first_index => "firstidx", + last_index => "lastidx", + first_value => "firstval", + last_value => "lastval", + zip => "mesh", + }, + v0_33 => { + distinct => "uniq", + }, + v0_400 => { + first_result => "firstres", + only_index => "onlyidx", + only_value => "onlyval", + only_result => "onlyres", + last_result => "lastres", + bsearch_index => "bsearchidx", + }, + v0_420 => { + bsearch_insert => "binsert", + bsearch_remove => 
"bremove", + zip_unflatten => "zip6", + }, +); + +our @EXPORT_OK = (@all_functions, map { keys %$_ } values %alias_list); +our %EXPORT_TAGS = ( + all => \@EXPORT_OK, + 'like_0.22' => [ + any_u => {-as => 'any'}, + all_u => {-as => 'all'}, + none_u => {-as => 'none'}, + notall_u => {-as => 'notall'}, + @v0_22, + keys %{$alias_list{v0_22}}, + ], + 'like_0.24' => [ + any_u => {-as => 'any'}, + all_u => {-as => 'all'}, + notall_u => {-as => 'notall'}, + 'none', + @v0_22, + @v0_24, + keys %{$alias_list{v0_22}}, + ], + 'like_0.33' => [ + @junctions, + @v0_22, + # v0_24 functions were omitted + @v0_33, + keys %{$alias_list{v0_22}}, + keys %{$alias_list{v0_33}}, + ], +); + +for my $set (values %alias_list) +{ + for my $alias (keys %$set) + { + ## no critic (TestingAndDebugging::ProhibitNoStrict) + no strict qw(refs); + *$alias = __PACKAGE__->can($set->{$alias}); + ## use critic (TestingAndDebugging::ProhibitNoStrict) + } +} +use strict; + +=pod + +=head1 NAME + +List::MoreUtils - Provide the stuff missing in List::Util + +=head1 SYNOPSIS + + # import specific functions + + use List::MoreUtils qw(any uniq); + + if ( any { /foo/ } uniq @has_duplicates ) { + # do stuff + } + + # import everything + + use List::MoreUtils ':all'; + + # import by API + + # has "original" any/all/none/notall behavior + use List::MoreUtils ':like_0.22'; + # 0.22 + bsearch + use List::MoreUtils ':like_0.24'; + # has "simplified" any/all/none/notall behavior + (n)sort_by + use List::MoreUtils ':like_0.33'; + +=head1 DESCRIPTION + +B provides some trivial but commonly needed functionality on +lists which is not going to go into L. + +All of the below functions are implementable in only a couple of lines of Perl +code. Using the functions from this module however should give slightly better +performance as everything is implemented in C. The pure-Perl implementation of +these functions only serves as a fallback in case the C portions of this module +couldn't be compiled on this machine. + +=head1 EXPORTS + +=head2 Default behavior + +Nothing by default. To import all of this module's symbols use the C<:all> tag. +Otherwise functions can be imported by name as usual: + + use List::MoreUtils ':all'; + + use List::MoreUtils qw{ any firstidx }; + +Because historical changes to the API might make upgrading List::MoreUtils +difficult for some projects, the legacy API is available via special import +tags. + +=head2 Like version 0.22 (last release with original API) + +This API was available from 2006 to 2009, returning undef for empty lists on +C/C/C/C: + + use List::MoreUtils ':like_0.22'; + +This import tag will import all functions available as of version 0.22. +However, it will import C as C, C as C, C as +C, and C as C. + +=head2 Like version 0.24 (first incompatible change) + +This API was available from 2010 to 2011. It changed the return value of C +and added the C function. + + use List::MoreUtils ':like_0.24'; + +This import tag will import all functions available as of version 0.24. +However it will import C as C, C as C, and +C as C. It will import C as described in +the documentation below (true for empty list). + +=head2 Like version 0.33 (second incompatible change) + +This API was available from 2011 to 2014. It is widely used in several CPAN +modules and thus it's closest to the current API. It changed the return values +of C, C, and C. It added the C and C functions +and the C alias for C. It omitted C. + + use List::MoreUtils ':like_0.33'; + +This import tag will import all functions available as of version 0.33. 
Note: +it will not import C for consistency with the 0.33 API. + +=head1 FUNCTIONS + +=head2 Junctions + +=head3 I + +There are two schools of thought for how to evaluate a junction on an +empty list: + +=over + +=item * + +Reduction to an identity (boolean) + +=item * + +Result is undefined (three-valued) + +=back + +In the first case, the result of the junction applied to the empty list is +determined by a mathematical reduction to an identity depending on whether +the underlying comparison is "or" or "and". Conceptually: + + "any are true" "all are true" + -------------- -------------- + 2 elements: A || B || 0 A && B && 1 + 1 element: A || 0 A && 1 + 0 elements: 0 1 + +In the second case, three-value logic is desired, in which a junction +applied to an empty list returns C rather than true or false + +Junctions with a C<_u> suffix implement three-valued logic. Those +without are boolean. + +=head3 all BLOCK LIST + +=head3 all_u BLOCK LIST + +Returns a true value if all items in LIST meet the criterion given through +BLOCK. Sets C<$_> for each item in LIST in turn: + + print "All values are non-negative" + if all { $_ >= 0 } ($x, $y, $z); + +For an empty LIST, C returns true (i.e. no values failed the condition) +and C returns C. + +Thus, C<< all_u(@list) >> is equivalent to C<< @list ? all(@list) : undef >>. + +B: because Perl treats C as false, you must check the return value +of C with C or you will get the opposite result of what you +expect. + +=head3 any BLOCK LIST + +=head3 any_u BLOCK LIST + +Returns a true value if any item in LIST meets the criterion given through +BLOCK. Sets C<$_> for each item in LIST in turn: + + print "At least one non-negative value" + if any { $_ >= 0 } ($x, $y, $z); + +For an empty LIST, C returns false and C returns C. + +Thus, C<< any_u(@list) >> is equivalent to C<< @list ? any(@list) : undef >>. + +=head3 none BLOCK LIST + +=head3 none_u BLOCK LIST + +Logically the negation of C. Returns a true value if no item in LIST meets +the criterion given through BLOCK. Sets C<$_> for each item in LIST in turn: + + print "No non-negative values" + if none { $_ >= 0 } ($x, $y, $z); + +For an empty LIST, C returns true (i.e. no values failed the condition) +and C returns C. + +Thus, C<< none_u(@list) >> is equivalent to C<< @list ? none(@list) : undef >>. + +B: because Perl treats C as false, you must check the return value +of C with C or you will get the opposite result of what you +expect. + +=head3 notall BLOCK LIST + +=head3 notall_u BLOCK LIST + +Logically the negation of C. Returns a true value if not all items in LIST +meet the criterion given through BLOCK. Sets C<$_> for each item in LIST in +turn: + + print "Not all values are non-negative" + if notall { $_ >= 0 } ($x, $y, $z); + +For an empty LIST, C returns false and C returns C. + +Thus, C<< notall_u(@list) >> is equivalent to C<< @list ? notall(@list) : undef >>. + +=head3 one BLOCK LIST + +=head3 one_u BLOCK LIST + +Returns a true value if precisely one item in LIST meets the criterion +given through BLOCK. Sets C<$_> for each item in LIST in turn: + + print "Precisely one value defined" + if one { defined($_) } @list; + +Returns false otherwise. + +For an empty LIST, C returns false and C returns C. + +The expression C is almost equivalent to +C<1 == true BLOCK LIST>, except for short-cutting. +Evaluation of BLOCK will immediately stop at the second true value. 
+ +=head2 Transformation + +=head3 apply BLOCK LIST + +Applies BLOCK to each item in LIST and returns a list of the values after BLOCK +has been applied. In scalar context, the last element is returned. This +function is similar to C but will not modify the elements of the input +list: + + my @list = (1 .. 4); + my @mult = apply { $_ *= 2 } @list; + print "\@list = @list\n"; + print "\@mult = @mult\n"; + __END__ + @list = 1 2 3 4 + @mult = 2 4 6 8 + +Think of it as syntactic sugar for + + for (my @mult = @list) { $_ *= 2 } + +=head3 insert_after BLOCK VALUE LIST + +Inserts VALUE after the first item in LIST for which the criterion in BLOCK is +true. Sets C<$_> for each item in LIST in turn. + + my @list = qw/This is a list/; + insert_after { $_ eq "a" } "longer" => @list; + print "@list"; + __END__ + This is a longer list + +=head3 insert_after_string STRING VALUE LIST + +Inserts VALUE after the first item in LIST which is equal to STRING. + + my @list = qw/This is a list/; + insert_after_string "a", "longer" => @list; + print "@list"; + __END__ + This is a longer list + +=head3 pairwise BLOCK ARRAY1 ARRAY2 + +Evaluates BLOCK for each pair of elements in ARRAY1 and ARRAY2 and returns a +new list consisting of BLOCK's return values. The two elements are set to C<$a> +and C<$b>. Note that those two are aliases to the original value so changing +them will modify the input arrays. + + @a = (1 .. 5); + @b = (11 .. 15); + @x = pairwise { $a + $b } @a, @b; # returns 12, 14, 16, 18, 20 + + # mesh with pairwise + @a = qw/a b c/; + @b = qw/1 2 3/; + @x = pairwise { ($a, $b) } @a, @b; # returns a, 1, b, 2, c, 3 + +=head3 mesh ARRAY1 ARRAY2 [ ARRAY3 ... ] + +=head3 zip ARRAY1 ARRAY2 [ ARRAY3 ... ] + +Returns a list consisting of the first elements of each array, then +the second, then the third, etc, until all arrays are exhausted. + +Examples: + + @x = qw/a b c d/; + @y = qw/1 2 3 4/; + @z = mesh @x, @y; # returns a, 1, b, 2, c, 3, d, 4 + + @a = ('x'); + @b = ('1', '2'); + @c = qw/zip zap zot/; + @d = mesh @a, @b, @c; # x, 1, zip, undef, 2, zap, undef, undef, zot + +C is an alias for C. + +=head3 zip6 + +=head3 zip_unflatten + +Returns a list of arrays consisting of the first elements of each array, +then the second, then the third, etc, until all arrays are exhausted. + + @x = qw/a b c d/; + @y = qw/1 2 3 4/; + @z = zip6 @x, @y; # returns [a, 1], [b, 2], [c, 3], [d, 4] + + @a = ('x'); + @b = ('1', '2'); + @c = qw/zip zap zot/; + @d = zip6 @a, @b, @c; # [x, 1, zip], [undef, 2, zap], [undef, undef, zot] + +C is an alias for C. + +=head3 listcmp ARRAY0 ARRAY1 [ ARRAY2 ... ] + +Returns an associative list of elements and every I of the list it +was found in. Allows easy implementation of @a & @b, @a | @b, @a ^ @b and +so on. +Undefined entries in any given array are skipped. + + my @a = qw(one two three four five six seven eight nine ten eleven twelve thirteen); + my @b = qw(two three five seven eleven thirteen seventeen); + my @c = qw(one one two three five eight thirteen twentyone); + my %cmp = listcmp @a, @b, @c; # returns (one => [0, 2], two => [0, 1, 2], three => [0, 1, 2], four => [0], ...) + + my @seq = (1, 2, 3); + my @prim = (undef, 2, 3, 5); + my @fib = (1, 1, 2); + my %cmp = listcmp @seq, @prim, @fib; + # returns ( 1 => [0, 2], 2 => [0, 1, 2], 3 => [0, 1], 5 => [1] ) + +=head3 arrayify LIST[,LIST[,LIST...]] + +Returns a list consisting of each element of given arrays. Recursive arrays +are flattened, too. 
+ + @a = (1, [[2], 3], 4, [5], 6, [7], 8, 9); + @l = arrayify @a; # returns 1, 2, 3, 4, 5, 6, 7, 8, 9 + +=head3 uniq LIST + +=head3 distinct LIST + +Returns a new list by stripping duplicate values in LIST by comparing +the values as hash keys, except that undef is considered separate from ''. +The order of elements in the returned list is the same as in LIST. In +scalar context, returns the number of unique elements in LIST. + + my @x = uniq 1, 1, 2, 2, 3, 5, 3, 4; # returns 1 2 3 5 4 + my $x = uniq 1, 1, 2, 2, 3, 5, 3, 4; # returns 5 + # returns "Mike", "Michael", "Richard", "Rick" + my @n = distinct "Mike", "Michael", "Richard", "Rick", "Michael", "Rick" + # returns "A8", "", undef, "A5", "S1" + my @s = distinct "A8", "", undef, "A5", "S1", "A5", "A8" + # returns "Giulia", "Giulietta", undef, "", 156, "GTA", "GTV", 159, "Brera", "4C" + my @w = uniq "Giulia", "Giulietta", undef, "", 156, "GTA", "GTV", 159, "Brera", "4C", "Giulietta", "Giulia" + +C is an alias for C. + +B can be used to give feedback about this behavior. + +=head3 singleton LIST + +Returns a new list by stripping values in LIST occurring more than once by +comparing the values as hash keys, except that undef is considered separate +from ''. The order of elements in the returned list is the same as in LIST. +In scalar context, returns the number of elements occurring only once in LIST. + + my @x = singleton 1,1,2,2,3,4,5 # returns 3 4 5 + +=head3 duplicates LIST + +Returns a new list by stripping values in LIST occurring less than twice by +comparing the values as hash keys, except that undef is considered separate +from ''. The order of elements in the returned list is the same as in LIST. +In scalar context, returns the number of elements occurring more than once +in LIST. + + my @y = duplicates 1,1,2,4,7,2,3,4,6,9; #returns 1,2,4 + +=head3 frequency LIST + +Returns an associative list of distinct values and the corresponding frequency. + + my @f = frequency values %radio_nrw; # returns ( + # 'Deutschlandfunk (DLF)' => 9, 'WDR 3' => 10, + # 'WDR 4' => 11, 'WDR 5' => 14, 'WDR Eins Live' => 14, + # 'Deutschlandradio Kultur' => 8,...) + +=head3 occurrences LIST + +Returns a new list of frequencies and the corresponding values from LIST. + + my @o = occurrences ((1) x 3, (2) x 4, (3) x 2, (4) x 7, (5) x 2, (6) x 4); + # @o = (undef, undef, [3, 5], [1], [2, 6], undef, undef, [4]); + +=head3 mode LIST + +Returns the modal value of LIST. In scalar context, just the modal value +is returned, in list context all probes occurring I times are returned, +too. + + my @m = mode ((1) x 3, (2) x 4, (3) x 2, (4) x 7, (5) x 2, (6) x 4, (7) x 3, (8) x 7); + # @m = (7, 4, 8) - bimodal LIST + +=head3 slide BLOCK LIST + +The function C operates on pairs of list elements like: + + my @s = slide { "$a and $b" } (0..3); + # @s = ("0 and 1", "1 and 2", "2 and 3") + +The idea behind this function is a kind of magnifying glass that is moved +along a list and calls C every time the next list item is reached. + +=head2 Partitioning + +=head3 after BLOCK LIST + +Returns a list of the values of LIST after (and not including) the point +where BLOCK returns a true value. Sets C<$_> for each element in LIST in turn. + + @x = after { $_ % 5 == 0 } (1..9); # returns 6, 7, 8, 9 + +=head3 after_incl BLOCK LIST + +Same as C but also includes the element for which BLOCK is true. + +=head3 before BLOCK LIST + +Returns a list of values of LIST up to (and not including) the point where BLOCK +returns a true value. Sets C<$_> for each element in LIST in turn. 
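As an illustrative sketch, mirroring the C<after> example above:

    @x = before { $_ % 5 == 0 } (1..9); # returns 1, 2, 3, 4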
+ +=head3 before_incl BLOCK LIST + +Same as C but also includes the element for which BLOCK is true. + +=head3 part BLOCK LIST + +Partitions LIST based on the return value of BLOCK which denotes into which +partition the current value is put. + +Returns a list of the partitions thusly created. Each partition created is a +reference to an array. + + my $i = 0; + my @part = part { $i++ % 2 } 1 .. 8; # returns [1, 3, 5, 7], [2, 4, 6, 8] + +You can have a sparse list of partitions as well where non-set partitions will +be undef: + + my @part = part { 2 } 1 .. 10; # returns undef, undef, [ 1 .. 10 ] + +Be careful with negative values, though: + + my @part = part { -1 } 1 .. 10; + __END__ + Modification of non-creatable array value attempted, subscript -1 ... + +Negative values are only ok when they refer to a partition previously created: + + my @idx = ( 0, 1, -1 ); + my $i = 0; + my @part = part { $idx[$i++ % 3] } 1 .. 8; # [1, 4, 7], [2, 3, 5, 6, 8] + +=head3 samples COUNT LIST + +Returns a new list containing COUNT random samples from LIST. Is similar to +L, but stops after COUNT. + + @r = samples 10, 1..10; # same as shuffle + @r2 = samples 5, 1..10; # gives 5 values from 1..10; + +=head2 Iteration + +=head3 each_array ARRAY1 ARRAY2 ... + +Creates an array iterator to return the elements of the list of arrays ARRAY1, +ARRAY2 throughout ARRAYn in turn. That is, the first time it is called, it +returns the first element of each array. The next time, it returns the second +elements. And so on, until all elements are exhausted. + +This is useful for looping over more than one array at once: + + my $ea = each_array(@a, @b, @c); + while ( my ($a, $b, $c) = $ea->() ) { .... } + +The iterator returns the empty list when it reached the end of all arrays. + +If the iterator is passed an argument of 'C', then it returns +the index of the last fetched set of values, as a scalar. + +=head3 each_arrayref LIST + +Like each_array, but the arguments are references to arrays, not the +plain arrays. + +=head3 natatime EXPR, LIST + +Creates an array iterator, for looping over an array in chunks of +C<$n> items at a time. (n at a time, get it?). An example is +probably a better explanation than I could give in words. + +Example: + + my @x = ('a' .. 'g'); + my $it = natatime 3, @x; + while (my @vals = $it->()) + { + print "@vals\n"; + } + +This prints + + a b c + d e f + g + +=head3 slideatatime STEP, WINDOW, LIST + +Creates an array iterator, for looping over an array in chunks of +C<$windows-size> items at a time. + +The idea behind this function is a kind of magnifying glass (finer +controllable compared to L) that is moved along a list. + +Example: + + my @x = ('a' .. 'g'); + my $it = slideatatime 2, 3, @x; + while (my @vals = $it->()) + { + print "@vals\n"; + } + +This prints + + a b c + c d e + e f g + g + +=head2 Searching + +=head3 firstval BLOCK LIST + +=head3 first_value BLOCK LIST + +Returns the first element in LIST for which BLOCK evaluates to true. Each +element of LIST is set to C<$_> in turn. Returns C if no such element +has been found. + +C is an alias for C. + +=head3 onlyval BLOCK LIST + +=head3 only_value BLOCK LIST + +Returns the only element in LIST for which BLOCK evaluates to true. Sets +C<$_> for each item in LIST in turn. Returns C if no such element +has been found. + +C is an alias for C. + +=head3 lastval BLOCK LIST + +=head3 last_value BLOCK LIST + +Returns the last value in LIST for which BLOCK evaluates to true. Each element +of LIST is set to C<$_> in turn. 
Returns C<undef> if no such element has been
+found.
+
+C<last_value> is an alias for C<lastval>.
+
+=head3 firstres BLOCK LIST
+
+=head3 first_result BLOCK LIST
+
+Returns the result of BLOCK for the first element in LIST for which BLOCK
+evaluates to true. Each element of LIST is set to C<$_> in turn. Returns
+C<undef> if no such element has been found.
+
+C<first_result> is an alias for C<firstres>.
+
+=head3 onlyres BLOCK LIST
+
+=head3 only_result BLOCK LIST
+
+Returns the result of BLOCK for the first element in LIST for which BLOCK
+evaluates to true. Sets C<$_> for each item in LIST in turn. Returns
+C<undef> if no such element has been found.
+
+C<only_result> is an alias for C<onlyres>.
+
+=head3 lastres BLOCK LIST
+
+=head3 last_result BLOCK LIST
+
+Returns the result of BLOCK for the last element in LIST for which BLOCK
+evaluates to true. Each element of LIST is set to C<$_> in turn. Returns
+C<undef> if no such element has been found.
+
+C<last_result> is an alias for C<lastres>.
+
+=head3 indexes BLOCK LIST
+
+Evaluates BLOCK for each element in LIST (assigned to C<$_>) and returns a list
+of the indices of those elements for which BLOCK returned a true value. This is
+just like C<grep> except that it returns indices instead of values:
+
+  @x = indexes { $_ % 2 == 0 } (1..10); # returns 1, 3, 5, 7, 9
+
+=head3 firstidx BLOCK LIST
+
+=head3 first_index BLOCK LIST
+
+Returns the index of the first element in LIST for which the criterion in BLOCK
+is true. Sets C<$_> for each item in LIST in turn:
+
+  my @list = (1, 4, 3, 2, 4, 6);
+  printf "item with index %i in list is 4", firstidx { $_ == 4 } @list;
+  __END__
+  item with index 1 in list is 4
+
+Returns C<-1> if no such item could be found.
+
+C<first_index> is an alias for C<firstidx>.
+
+=head3 onlyidx BLOCK LIST
+
+=head3 only_index BLOCK LIST
+
+Returns the index of the only element in LIST for which the criterion
+in BLOCK is true. Sets C<$_> for each item in LIST in turn:
+
+  my @list = (1, 3, 4, 3, 2, 4);
+  printf "unique index of item 2 in list is %i", onlyidx { $_ == 2 } @list;
+  __END__
+  unique index of item 2 in list is 4
+
+Returns C<-1> if either no such item or more than one of these
+has been found.
+
+C<only_index> is an alias for C<onlyidx>.
+
+=head3 lastidx BLOCK LIST
+
+=head3 last_index BLOCK LIST
+
+Returns the index of the last element in LIST for which the criterion in BLOCK
+is true. Sets C<$_> for each item in LIST in turn:
+
+  my @list = (1, 4, 3, 2, 4, 6);
+  printf "item with index %i in list is 4", lastidx { $_ == 4 } @list;
+  __END__
+  item with index 4 in list is 4
+
+Returns C<-1> if no such item could be found.
+
+C<last_index> is an alias for C<lastidx>.
+
+=head2 Sorting
+
+=head3 sort_by BLOCK LIST
+
+Returns the list of values sorted according to the string values returned by the
+KEYFUNC block or function. A typical use of this may be to sort objects according
+to the string value of some accessor, such as
+
+  sort_by { $_->name } @people
+
+The key function is called in scalar context, being passed each value in turn as
+both $_ and the only argument in the parameters, @_. The values are then sorted
+according to string comparisons on the values returned.
+This is equivalent to
+
+  sort { $a->name cmp $b->name } @people
+
+except that it guarantees the name accessor will be executed only once per value.
+One interesting use-case is to sort strings which may have numbers embedded in them
+"naturally", rather than lexically.
+
+  sort_by { s/(\d+)/sprintf "%09d", $1/eg; $_ } @strings
+
+This sorts strings by generating sort keys which zero-pad the embedded numbers to
+some level (9 digits in this case), helping to ensure the lexical sort puts them
+in the correct order.
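+
+A variant of that idiom, sketched here, copies C<$_> into a scratch variable
+first, so that generating the sort keys does not modify the original strings
+in place:
+
+  use List::MoreUtils qw(sort_by);
+
+  my @strings = qw(file10 file2 file1);
+  my @sorted  = sort_by { (my $k = $_) =~ s/(\d+)/sprintf "%09d", $1/eg; $k } @strings;
+  # @sorted is ("file1", "file2", "file10"); @strings is left unchanged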
+
+=head3 nsort_by BLOCK LIST
+
+Similar to C<sort_by> but compares its key values numerically.
+
+=head3 qsort BLOCK ARRAY
+
+This sorts the given array B<in place> using the given compare code. Except for
+tiny compare code like C<< $a <=> $b >>, qsort is much faster than Perl's C<sort>
+depending on the version.
+
+Compared on Perl 5.8 and 5.26:
+
+  my @rl;
+  for(my $i = 0; $i < 1E6; ++$i) { push @rl, rand(1E5) }
+  my $idx;
+
+  sub ext_cmp { $_[0] <=> $_[1] }
+
+  cmpthese( -60, {
+    'qsort' => sub {
+      my @qrl = @rl;
+      qsort { ext_cmp($a, $b) } @qrl;
+      $idx = bsearchidx { ext_cmp($_, $rl[0]) } @qrl
+    },
+    'reverse qsort' => sub {
+      my @qrl = @rl;
+      qsort { ext_cmp($b, $a) } @qrl;
+      $idx = bsearchidx { ext_cmp($rl[0], $_) } @qrl
+    },
+    'sort' => sub {
+      my @srl = @rl;
+      @srl = sort { ext_cmp($a, $b) } @srl;
+      $idx = bsearchidx { ext_cmp($_, $rl[0]) } @srl
+    },
+    'reverse sort' => sub {
+      my @srl = @rl;
+      @srl = sort { ext_cmp($b, $a) } @srl;
+      $idx = bsearchidx { ext_cmp($rl[0], $_) } @srl
+    },
+  });
+
+5.8 results
+
+                s/iter reverse sort  sort reverse qsort qsort
+  reverse sort    6.21           --   -0%           -8%  -10%
+  sort            6.19           0%    --           -7%  -10%
+  reverse qsort   5.73           8%    8%            --   -2%
+  qsort           5.60          11%   11%            2%    --
+
+5.26 results
+
+                s/iter reverse sort  sort reverse qsort qsort
+  reverse sort    4.54           --   -0%          -96%  -96%
+  sort            4.52           0%    --          -96%  -96%
+  reverse qsort  0.203        2139% 2131%            --  -19%
+  qsort          0.164        2666% 2656%           24%    --
+
+Use it where external data sources might have to be compared (think of
+database "tables").
+
+C<qsort> is available from List::MoreUtils::XS only. It makes no sense to
+maintain a wrapper around Perl's C<sort>, nor to provide a pure-Perl
+implementation: one could draw a flip-book at the same speed a pure-Perl
+qsort would run.
+
+=head2 Searching in sorted Lists
+
+=head3 bsearch BLOCK LIST
+
+Performs a binary search on LIST which must be a sorted list of values. BLOCK
+must return a negative value if the current element (stored in C<$_>) is smaller,
+a positive value if it is bigger and zero if it matches.
+
+Returns a boolean value in scalar context. In list context, it returns the element
+if it was found, otherwise the empty list.
+
+=head3 bsearchidx BLOCK LIST
+
+=head3 bsearch_index BLOCK LIST
+
+Performs a binary search on LIST which must be a sorted list of values. BLOCK
+must return a negative value if the current element (stored in C<$_>) is smaller,
+a positive value if it is bigger and zero if it matches.
+
+Returns the index of the found element, otherwise C<-1>.
+
+C<bsearch_index> is an alias for C<bsearchidx>.
+
+=head3 lower_bound BLOCK LIST
+
+Returns the index of the first element in LIST which does not compare
+I<less than value>. Technically it's the first element in LIST which does
+not return a value below zero when passed to BLOCK.
+
+  @ids = (1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 7, 7, 7, 8, 8, 9, 9, 9, 9, 9, 11, 13, 13, 13, 17);
+  $lb = lower_bound { $_ <=> 2 } @ids; # returns 2
+  $lb = lower_bound { $_ <=> 4 } @ids; # returns 10
+
+lower_bound has a complexity of O(log n).
+
+=head3 upper_bound BLOCK LIST
+
+Returns the index of the first element in LIST which does not compare
+I<less than or equal to value>. Technically it's the first element in LIST
+which does not return a value below or equal to zero when passed to BLOCK.
+
+  @ids = (1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 7, 7, 7, 8, 8, 9, 9, 9, 9, 9, 11, 13, 13, 13, 17);
+  $lb = upper_bound { $_ <=> 2 } @ids; # returns 4
+  $lb = upper_bound { $_ <=> 4 } @ids; # returns 14
+
+upper_bound has a complexity of O(log n).
+
+=head3 equal_range BLOCK LIST
+
+Returns a pair of indices containing the lower_bound and the upper_bound.
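+
+A short sketch, using a shortened variant of the C<@ids> list from the
+C<lower_bound> and C<upper_bound> examples above:
+
+  use List::MoreUtils qw(equal_range);
+
+  my @ids = (1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5);
+  my ($lb, $ub) = equal_range { $_ <=> 3 } @ids;
+  # $lb == 4, $ub == 10: @ids[4 .. 9] are exactly the 3s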
+
+=head2 Operations on sorted Lists
+
+=head3 binsert BLOCK ITEM LIST
+
+=head3 bsearch_insert BLOCK ITEM LIST
+
+Performs a binary search on LIST which must be a sorted list of values. BLOCK
+must return a negative value if the current element (stored in C<$_>) is smaller,
+a positive value if it is bigger and zero if it matches.
+
+ITEM is inserted at the index where the ITEM should be placed (based on the above
+search). That means it's inserted before the next bigger element.
+
+  @l = (2,3,5,7);
+  binsert { $_ <=> 4 } 4, @l;  # @l = (2,3,4,5,7)
+  binsert { $_ <=> 6 } 42, @l; # @l = (2,3,4,5,42,7)
+
+It is your responsibility to ensure that the inserted element is consistent
+with the comparison done in BLOCK.
+
+=head3 bremove BLOCK LIST
+
+=head3 bsearch_remove BLOCK LIST
+
+Performs a binary search on LIST which must be a sorted list of values. BLOCK
+must return a negative value if the current element (stored in C<$_>) is smaller,
+a positive value if it is bigger and zero if it matches.
+
+The item at the found position is removed and returned.
+
+  @l = (2,3,4,5,7);
+  bremove { $_ <=> 4 } @l; # @l = (2,3,5,7);
+
+=head2 Counting and calculation
+
+=head3 true BLOCK LIST
+
+Counts the number of elements in LIST for which the criterion in BLOCK is true.
+Sets C<$_> for each item in LIST in turn:
+
+  printf "%i item(s) are defined", true { defined($_) } @list;
+
+=head3 false BLOCK LIST
+
+Counts the number of elements in LIST for which the criterion in BLOCK is false.
+Sets C<$_> for each item in LIST in turn:
+
+  printf "%i item(s) are not defined", false { defined($_) } @list;
+
+=head3 reduce_0 BLOCK LIST
+
+Reduce LIST by calling BLOCK in scalar context for each element of LIST.
+C<$a> contains the progressional result and is initialized with 0.
+C<$b> contains the current processed element of LIST and C<$_> contains the
+index of the element in C<$b>.
+
+The idea behind reduce_0 is B<summation> (addition of a sequence of numbers).
+
+=head3 reduce_1 BLOCK LIST
+
+Reduce LIST by calling BLOCK in scalar context for each element of LIST.
+C<$a> contains the progressional result and is initialized with 1.
+C<$b> contains the current processed element of LIST and C<$_> contains the
+index of the element in C<$b>.
+
+The idea behind reduce_1 is the B<product> of a sequence of numbers.
+
+=head3 reduce_u BLOCK LIST
+
+Reduce LIST by calling BLOCK in scalar context for each element of LIST.
+C<$a> contains the progressional result and is uninitialized.
+C<$b> contains the current processed element of LIST and C<$_> contains the
+index of the element in C<$b>.
+
+This function has been added for cases where one needs the extra index
+value but also an individual initialization.
+
+B<Note>: In most cases L<List::Util/reduce> will do the job better.
+
+=head3 minmax LIST
+
+Calculates the minimum and maximum of LIST and returns a two element list with
+the first element being the minimum and the second the maximum. Returns the
+empty list if LIST was empty.
+
+The C<minmax> algorithm differs from a naive iteration over the list where each
+element is compared to two values being the so far calculated min and max value
+in that it only requires 3n/2 - 2 comparisons. Thus it is the most efficient
+possible algorithm.
+
+However, the Perl implementation of it has some overhead simply due to the fact
+that there are more lines of Perl code involved. Therefore, LIST needs to be
+fairly big in order for C<minmax> to win over a naive implementation. This
+limitation does not apply to the XS version.
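+
+Two quick sketches of the calculation helpers described above - C<reduce_0>
+for a summation and C<minmax> for a single-pass minimum and maximum:
+
+  use List::MoreUtils qw(reduce_0 minmax);
+
+  my $sum = reduce_0 { $a + $b } 1 .. 5;          # 15
+  my ($min, $max) = minmax(43, 66, 77, 23, 780);  # (23, 780)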
+
+=head3 minmaxstr LIST
+
+Computes the minimum and maximum of LIST using string compare and returns a
+two element list with the first element being the minimum and the second the
+maximum. Returns the empty list if LIST was empty.
+
+The implementation is similar to C<minmax>.
+
+=head1 ENVIRONMENT
+
+When C<LIST_MOREUTILS_PP> is set, the module will always use the pure-Perl
+implementation and not the XS one. This environment variable is really just
+there for the test-suite to force testing the Perl implementation, and possibly
+for reporting of bugs. I don't see any reason to use it in a production
+environment.
+
+=head1 MAINTENANCE
+
+The maintenance goal is to preserve the documented semantics of the API;
+bug fixes that bring actual behavior in line with semantics are allowed.
+New API functions may be added over time. If a backwards incompatible
+change is unavoidable, we will attempt to provide support for the legacy
+API using the same export tag mechanism currently in place.
+
+This module attempts to use few non-core dependencies. Non-core
+configuration and testing modules will be bundled when reasonable;
+run-time dependencies will be added only if they deliver substantial
+benefit.
+
+=head1 CONTRIBUTING
+
+While contributions are appreciated, a contribution should not cause more
+effort for the maintainer than the contribution itself saves (see
+L<Open Source Contribution Etiquette|http://tirania.org/blog/archive/2010/Dec-31.html>).
+
+To get more familiar with where help could be needed - see
+L<List::MoreUtils::Contributing>.
+
+=head1 BUGS
+
+There is a problem with a bug in 5.6.x perls. It is a syntax error to write
+things like:
+
+  my @x = apply { s/foo/bar/ } qw{ foo bar baz };
+
+It has to be written as either
+
+  my @x = apply { s/foo/bar/ } 'foo', 'bar', 'baz';
+
+or
+
+  my @x = apply { s/foo/bar/ } my @dummy = qw/foo bar baz/;
+
+Perl 5.5.x and Perl 5.8.x don't suffer from this limitation.
+
+If you have a functionality that you could imagine being in this module, please
+drop me a line. This module's policy will be less strict than L<List::Util>'s
+when it comes to additions as it isn't a core module.
+
+When you report bugs, it would be nice if you could additionally give me the
+output of your program with the environment variable C<LIST_MOREUTILS_PP> set
+to a true value. That way I know where to look for the problem (in XS,
+pure-Perl or possibly both).
+
+=head1 SUPPORT
+
+Bugs should always be submitted via the CPAN bug tracker.
+
+You can find documentation for this module with the perldoc command.
+
+    perldoc List::MoreUtils
+
+You can also look for information at:
+
+=over 4
+
+=item * RT: CPAN's request tracker
+
+L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=List-MoreUtils>
+
+=item * AnnoCPAN: Annotated CPAN documentation
+
+L<http://annocpan.org/dist/List-MoreUtils>
+
+=item * CPAN Ratings
+
+L<http://cpanratings.perl.org/d/List-MoreUtils>
+
+=item * MetaCPAN
+
+L<https://metacpan.org/release/List-MoreUtils>
+
+=item * CPAN Search
+
+L<http://search.cpan.org/dist/List-MoreUtils/>
+
+=item * Git Repository
+
+L<https://github.com/perl5-utils/List-MoreUtils>
+
+=back
+
+=head2 Where can I go for help?
+
+If you have a bug report, a patch or a suggestion, please open a new
+report ticket at CPAN (but please check previous reports first in case
+your issue has already been addressed) or open an issue on GitHub.
+
+Report tickets should contain a detailed description of the bug or
+enhancement request and at least an easily verifiable way of
+reproducing the issue or fix. Patches are always welcome, too - and
+it's cheap to send pull-requests on GitHub. Please keep in mind that
+code changes are more likely to be accepted when they're bundled with an
+approving test.
+
+If you think you've found a bug then please read
+"How to Report Bugs Effectively" by Simon Tatham:
+L<https://www.chiark.greenend.org.uk/~sgtatham/bugs.html>.
+
+=head2 Where can I go for help with a concrete version?
+
+Bugs and feature requests are accepted against the latest version
+only.
To get patches for earlier versions, you need to come to an
+agreement with a developer of your choice - who may or may not report the
+issue and a suggested fix upstream (depending on the license you have
+chosen).
+
+=head2 Business support and maintenance
+
+Generally, in volunteer projects, there is no right to support.
+While every maintainer is happy to improve the provided software,
+spare time is limited.
+
+For those who have a use case which requires guaranteed support, one of
+the maintainers should be hired or contracted. For business support you
+can contact Jens via his CPAN email address rehsackATcpan.org. Please
+keep in mind that business support is neither available for free nor
+are you eligible to receive any support based on the license distributed
+with this package.
+
+=head1 THANKS
+
+=head2 Tassilo von Parseval
+
+Credits go to a number of people: Steve Purkis for giving me namespace advice
+and James Keenan and Terrence Branno for their effort of keeping the CPAN
+tidier by making L<List::Utils> obsolete.
+
+Brian McCauley suggested the inclusion of apply() and provided the pure-Perl
+implementation for it.
+
+Eric J. Roode asked me to add all functions from his module into this one.
+With minor modifications, the pure-Perl implementations of those are by him.
+
+The bunch of people who almost immediately pointed out the many problems with
+the glitchy 0.07 release (Slaven Rezic, Ron Savage, CPAN testers).
+
+A particularly nasty memory leak was spotted by Thomas A. Lowery.
+
+Lars Thegler made me aware of problems with older Perl versions.
+
+Anno Siegel de-orphaned each_arrayref().
+
+David Filmer made me aware of a problem in each_arrayref that could ultimately
+lead to a segfault.
+
+Ricardo Signes suggested the inclusion of part() and provided the
+Perl-implementation.
+
+Robin Huston kindly fixed a bug in perl's MULTICALL API to make the
+XS-implementation of part() work.
+
+=head2 Jens Rehsack
+
+Credits go to all the people contributing feedback during the v0.400
+development releases.
+
+Special thanks go to David Golden who spent a lot of effort to develop
+a design to support the current state of CPAN as well as ancient software
+somewhere in the dark. He also contributed a lot of patches to refactor
+the API frontend to welcome any user of List::MoreUtils - from the ancient
+past to the most recent.
+
+Toby Inkster provided a lot of useful feedback for sane importer code
+and was a nice sounding board for API discussions.
+
+Peter Rabbitson provided a sane git repository setup containing the entire
+package history.
+
+=head1 TODO
+
+A pile of requests from other people is still pending further processing in
+my mailbox. This includes:
+
+=over 4
+
+=item * delete_index
+
+=item * random_item
+
+=item * random_item_delete_index
+
+=item * list_diff_hash
+
+=item * list_diff_inboth
+
+=item * list_diff_infirst
+
+=item * list_diff_insecond
+
+These were all suggested by Dan Muey.
+
+=item * listify
+
+Always return a flat list when either a simple scalar value was passed or an
+array-reference. Suggested by Mark Summersault.
+
+=back
+
+=head1 SEE ALSO
+
+L<List::Util>, L<List::AllUtils>, L<List::UtilsBy>
+
+=head1 AUTHOR
+
+Jens Rehsack E<lt>rehsack AT cpan.orgE<gt>
+
+Adam Kennedy E<lt>adamk@cpan.orgE<gt>
+
+Tassilo von Parseval E<lt>tassilo.von.parseval@rwth-aachen.deE<gt>
+
+=head1 COPYRIGHT AND LICENSE
+
+Some parts copyright 2011 Aaron Crane.
+
+Copyright 2004 - 2010 by Tassilo von Parseval
+
+Copyright 2013 - 2017 by Jens Rehsack
+
+All code added with 0.417 or later is licensed under the Apache License,
+Version 2.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+All code until 0.416 is licensed under the same terms as Perl itself,
+either Perl version 5.8.4 or, at your option, any later version of
+Perl 5 you may have available.
+
+=cut
+
+1;
diff --git a/src/modules/List/MoreUtils/Contributing.pod b/src/modules/List/MoreUtils/Contributing.pod
new file mode 100644
index 0000000..11b7996
--- /dev/null
+++ b/src/modules/List/MoreUtils/Contributing.pod
@@ -0,0 +1,100 @@
+=head1 NAME
+
+List::MoreUtils::Contributing - Gives rough introduction into contributing to List::MoreUtils
+
+=head1 DESCRIPTION
+
+List::MoreUtils has a turbulent history and a strong approach. Before
+going further, please review the distribution's history and then
+come back.
+
+The current distribution is a balance between finishing the history and
+preparing for future requirements. Therefore some components will receive
+a rewrite on purpose - others won't.
+
+For the moment - it's not the primary goal to clean up the configuration
+stage, until the primary goals and prerequisites are done.
+
+To contribute to List::MoreUtils, one has to come to terms with the current
+situation, dig into details and ask for clarification when parts are
+incomprehensible.
+
+=head2 Primary Goals
+
+The very first primary goal is to clear the backlog. These are primarily
+the open issues, feature requests and missing infrastructure elements.
+
+As examples, see RT#93207 or RT#75672 for missing configure time checks,
+while RT#93207 radiates into the test stage - but doesn't affect runtime nor
+installation (beside test failures).
+
+=head2 Secondary Goals
+
+Secondary goals are harmonizing the function names and calling conventions
+(see RT#102673), tidying the infrastructure of the distribution and removing
+unnecessary complexity (while protecting the necessary).
+
+One example of removing unnecessary infrastructure could be to move the
+bundled configuration and test helper modules into authoring mode, when an
+improved test for RT#93207 could be reasonably done by a module which
+is recommended for test. The recommendation of such a module in the
+distribution metadata is a desirable one.
+
+=head2 Orientation Guide
+
+The List::MoreUtils configuration stage heavily depends on
+L<Config::AutoConf> and L<ExtUtils::MakeMaker>. A few prerequisites of both
+modules aren't available for Perl 5.6 - which leads to a tiny emulation layer
+at the beginning of C<Makefile.PL>.
+
+The reason for L<Config::AutoConf> is quite simple - the opportunities
+for checking the environment cover a much wider range than a simple test
+whether there is a working compiler. It requires a lot of improvements,
+since its base L<ExtUtils::CBuilder> was never designed to support
+that kind of solution - but there is progress. To finally
+solve issues such as RT#75672 even in cross-compile environments - there is
+no way around such a checking tool.
+
+The reason for the chosen test modules in combination is extensible tests
+with reasonable effort and easy figuring out which
+extra condition causes failures.
Also - missing pre-conditions should +result in failing tests i some cases - what is fully supported by the +logic behind L in combination with L. + +Finally - L glues the stuff in a bundle together to allow +people with older toolchains to use List::MoreUtils out of the box (maybe +with reduced quantity but full quality). + +=head1 SEE ALSO + +L, L, L, +L + +=head1 AUTHOR + +Jens Rehsack Erehsack AT cpan.orgE + +=head1 COPYRIGHT AND LICENSE + +Copyright 2015-2017 by Jens Rehsack + +All code added with 0.417 or later is licensed under the Apache License, +Version 2.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +All code until 0.416 is licensed under the same terms as Perl itself, +either Perl version 5.8.4 or, at your option, any later version of +Perl 5 you may have available. + +=cut diff --git a/src/modules/List/MoreUtils/PP.pm b/src/modules/List/MoreUtils/PP.pm new file mode 100644 index 0000000..320635b --- /dev/null +++ b/src/modules/List/MoreUtils/PP.pm @@ -0,0 +1,953 @@ +package List::MoreUtils::PP; + +use 5.008_001; +use strict; +use warnings; + +our $VERSION = '0.430'; + +=pod + +=head1 NAME + +List::MoreUtils::PP - Provide List::MoreUtils pure Perl implementation + +=head1 SYNOPSIS + + BEGIN { $ENV{LIST_MOREUTILS_PP} = 1; } + use List::MoreUtils qw(:all); + +=cut + +## no critic (Subroutines::ProhibitSubroutinePrototypes, Subroutines::RequireArgUnpacking) +## no critic (Subroutines::ProhibitManyArgs) + +sub any (&@) +{ + my $f = shift; + foreach (@_) + { + return 1 if $f->(); + } + return 0; +} + +sub all (&@) +{ + my $f = shift; + foreach (@_) + { + return 0 unless $f->(); + } + return 1; +} + +sub none (&@) +{ + my $f = shift; + foreach (@_) + { + return 0 if $f->(); + } + return 1; +} + +sub notall (&@) +{ + my $f = shift; + foreach (@_) + { + return 1 unless $f->(); + } + return 0; +} + +sub one (&@) +{ + my $f = shift; + my $found = 0; + foreach (@_) + { + $f->() and $found++ and return 0; + } + return $found; +} + +sub any_u (&@) +{ + my $f = shift; + return if !@_; + $f->() and return 1 foreach (@_); + return 0; +} + +sub all_u (&@) +{ + my $f = shift; + return if !@_; + $f->() or return 0 foreach (@_); + return 1; +} + +sub none_u (&@) +{ + my $f = shift; + return if !@_; + $f->() and return 0 foreach (@_); + return 1; +} + +sub notall_u (&@) +{ + my $f = shift; + return if !@_; + $f->() or return 1 foreach (@_); + return 0; +} + +sub one_u (&@) +{ + my $f = shift; + return if !@_; + my $found = 0; + foreach (@_) + { + $f->() and $found++ and return 0; + } + return $found; +} + +sub reduce_u(&@) +{ + my $code = shift; + + # Localise $a, $b + my ($caller_a, $caller_b) = do + { + my $pkg = caller(); + ## no critic (TestingAndDebugging::ProhibitNoStrict, ValuesAndExpressions::ProhibitCommaSeparatedStatements) + no strict 'refs'; + \*{$pkg . '::a'}, \*{$pkg . '::b'}; + }; + + ## no critic (Variables::RequireInitializationForLocalVars) + local (*$caller_a, *$caller_b); + *$caller_a = \(); + for (0 .. 
$#_) + { + *$caller_b = \$_[$_]; + *$caller_a = \($code->()); + } + + return ${*$caller_a}; +} + +sub reduce_0(&@) +{ + my $code = shift; + + # Localise $a, $b + my ($caller_a, $caller_b) = do + { + my $pkg = caller(); + ## no critic (TestingAndDebugging::ProhibitNoStrict, ValuesAndExpressions::ProhibitCommaSeparatedStatements) + no strict 'refs'; + \*{$pkg . '::a'}, \*{$pkg . '::b'}; + }; + + ## no critic (Variables::RequireInitializationForLocalVars) + local (*$caller_a, *$caller_b); + *$caller_a = \0; + for (0 .. $#_) + { + *$caller_b = \$_[$_]; + *$caller_a = \($code->()); + } + + return ${*$caller_a}; +} + +sub reduce_1(&@) +{ + my $code = shift; + + # Localise $a, $b + my ($caller_a, $caller_b) = do + { + my $pkg = caller(); + ## no critic (TestingAndDebugging::ProhibitNoStrict, ValuesAndExpressions::ProhibitCommaSeparatedStatements) + no strict 'refs'; + \*{$pkg . '::a'}, \*{$pkg . '::b'}; + }; + + ## no critic (Variables::RequireInitializationForLocalVars) + local (*$caller_a, *$caller_b); + *$caller_a = \1; + for (0 .. $#_) + { + *$caller_b = \$_[$_]; + *$caller_a = \($code->()); + } + + return ${*$caller_a}; +} + +sub true (&@) +{ + my $f = shift; + my $count = 0; + $f->() and ++$count foreach (@_); + return $count; +} + +sub false (&@) +{ + my $f = shift; + my $count = 0; + $f->() or ++$count foreach (@_); + return $count; +} + +sub firstidx (&@) +{ + my $f = shift; + foreach my $i (0 .. $#_) + { + local *_ = \$_[$i]; + return $i if $f->(); + } + return -1; +} + +sub firstval (&@) +{ + my $test = shift; + foreach (@_) + { + return $_ if $test->(); + } + ## no critic (Subroutines::ProhibitExplicitReturnUndef) + return undef; +} + +sub firstres (&@) +{ + my $test = shift; + foreach (@_) + { + my $testval = $test->(); + $testval and return $testval; + } + ## no critic (Subroutines::ProhibitExplicitReturnUndef) + return undef; +} + +sub onlyidx (&@) +{ + my $f = shift; + my $found; + foreach my $i (0 .. $#_) + { + local *_ = \$_[$i]; + $f->() or next; + defined $found and return -1; + $found = $i; + } + return defined $found ? $found : -1; +} + +sub onlyval (&@) +{ + my $test = shift; + my $result = undef; + my $found = 0; + foreach (@_) + { + $test->() or next; + $result = $_; + ## no critic (Subroutines::ProhibitExplicitReturnUndef) + $found++ and return undef; + } + return $result; +} + +sub onlyres (&@) +{ + my $test = shift; + my $result = undef; + my $found = 0; + foreach (@_) + { + my $rv = $test->() or next; + $result = $rv; + ## no critic (Subroutines::ProhibitExplicitReturnUndef) + $found++ and return undef; + } + return $found ? $result : undef; +} + +sub lastidx (&@) +{ + my $f = shift; + foreach my $i (reverse 0 .. $#_) + { + local *_ = \$_[$i]; + return $i if $f->(); + } + return -1; +} + +sub lastval (&@) +{ + my $test = shift; + my $ix; + for ($ix = $#_; $ix >= 0; $ix--) + { + local *_ = \$_[$ix]; + my $testval = $test->(); + + # Simulate $_ as alias + $_[$ix] = $_; + return $_ if $testval; + } + ## no critic (Subroutines::ProhibitExplicitReturnUndef) + return undef; +} + +sub lastres (&@) +{ + my $test = shift; + my $ix; + for ($ix = $#_; $ix >= 0; $ix--) + { + local *_ = \$_[$ix]; + my $testval = $test->(); + + # Simulate $_ as alias + $_[$ix] = $_; + return $testval if $testval; + } + ## no critic (Subroutines::ProhibitExplicitReturnUndef) + return undef; +} + +sub insert_after (&$\@) +{ + my ($f, $val, $list) = @_; + my $c = &firstidx($f, @$list); + @$list = (@{$list}[0 .. $c], $val, @{$list}[$c + 1 .. 
$#$list],) and return 1 if $c != -1; + return 0; +} + +sub insert_after_string ($$\@) +{ + my ($string, $val, $list) = @_; + my $c = firstidx { defined $_ and $string eq $_ } @$list; + @$list = (@{$list}[0 .. $c], $val, @{$list}[$c + 1 .. $#$list],) and return 1 if $c != -1; + return 0; +} + +sub apply (&@) +{ + my $action = shift; + &$action foreach my @values = @_; + return wantarray ? @values : $values[-1]; +} + +sub after (&@) +{ + my $test = shift; + my $started; + my $lag; + ## no critic (BuiltinFunctions::RequireBlockGrep) + return grep $started ||= do + { + my $x = $lag; + $lag = $test->(); + $x; + }, @_; +} + +sub after_incl (&@) +{ + my $test = shift; + my $started; + return grep { $started ||= $test->() } @_; +} + +sub before (&@) +{ + my $test = shift; + my $more = 1; + return grep { $more &&= !$test->() } @_; +} + +sub before_incl (&@) +{ + my $test = shift; + my $more = 1; + my $lag = 1; + ## no critic (BuiltinFunctions::RequireBlockGrep) + return grep $more &&= do + { + my $x = $lag; + $lag = !$test->(); + $x; + }, @_; +} + +sub indexes (&@) +{ + my $test = shift; + return grep { + local *_ = \$_[$_]; + $test->() + } 0 .. $#_; +} + +sub pairwise (&\@\@) +{ + my $op = shift; + + # Symbols for caller's input arrays + use vars qw{ @A @B }; + local (*A, *B) = @_; + + # Localise $a, $b + my ($caller_a, $caller_b) = do + { + my $pkg = caller(); + ## no critic (TestingAndDebugging::ProhibitNoStrict, ValuesAndExpressions::ProhibitCommaSeparatedStatements) + no strict 'refs'; + \*{$pkg . '::a'}, \*{$pkg . '::b'}; + }; + + # Loop iteration limit + my $limit = $#A > $#B ? $#A : $#B; + + ## no critic (Variables::RequireInitializationForLocalVars) + # This map expression is also the return value + local (*$caller_a, *$caller_b); + ## no critic (BuiltinFunctions::ProhibitComplexMappings) + return map { + # Assign to $a, $b as refs to caller's array elements + (*$caller_a, *$caller_b) = \($#A < $_ ? undef : $A[$_], $#B < $_ ? undef : $B[$_]); + + # Perform the transformation + $op->(); + } 0 .. $limit; +} + +sub each_array (\@;\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@) +{ + return each_arrayref(@_); +} + +sub each_arrayref +{ + my @list = @_; # The list of references to the arrays + my $index = 0; # Which one the caller will get next + my $max = 0; # Number of elements in longest array + + # Get the length of the longest input array + foreach (@list) + { + unless (ref $_ eq 'ARRAY') + { + require Carp; + Carp::croak("each_arrayref: argument is not an array reference\n"); + } + $max = @$_ if @$_ > $max; + } + + # Return the iterator as a closure wrt the above variables. + return sub { + if (@_) + { + my $method = shift; + unless ($method eq 'index') + { + require Carp; + Carp::croak("each_array: unknown argument '$method' passed to iterator."); + } + + ## no critic (Subroutines::ProhibitExplicitReturnUndef) + return undef if $index == 0 || $index > $max; + # Return current (last fetched) index + return $index - 1; + } + + # No more elements to return + return if $index >= $max; + my $i = $index++; + + # Return ith elements + ## no critic (BuiltinFunctions::RequireBlockMap) + return map $_->[$i], @list; + } +} + +sub natatime ($@) +{ + my $n = shift; + my @list = @_; + return sub { return splice @list, 0, $n; } +} + +# "leaks" when lexically hidden in arrayify +my $flatten; +$flatten = sub { + return map { (ref $_ and ("ARRAY" eq ref $_ or overload::Method($_, '@{}'))) ? 
($flatten->(@{$_})) : ($_) } @_; +}; + +sub arrayify +{ + return map { $flatten->($_) } @_; +} + +sub mesh (\@\@;\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@) +{ + my $max = -1; + $max < $#$_ && ($max = $#$_) foreach @_; + ## no critic (BuiltinFunctions::ProhibitComplexMappings) + return map { + my $ix = $_; + ## no critic (BuiltinFunctions::RequireBlockMap) + map $_->[$ix], @_; + } 0 .. $max; +} + +sub zip6 (\@\@;\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@) +{ + my $max = -1; + $max < $#$_ && ($max = $#$_) foreach @_; + ## no critic (BuiltinFunctions::ProhibitComplexMappings) + return map { + my $ix = $_; + ## no critic (BuiltinFunctions::RequireBlockMap) + [map $_->[$ix], @_]; + } 0 .. $max; +} + +sub listcmp (\@\@;\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@\@) +{ + my %ret; + for (my $i = 0; $i < scalar @_; ++$i) + { + my %seen; + my $k; + foreach my $w (grep { defined $_ and not $seen{$k = $_}++ } @{$_[$i]}) + { + $ret{$w} ||= []; + push @{$ret{$w}}, $i; + } + } + return %ret; +} + +sub uniq (@) +{ + my %seen = (); + my $k; + my $seen_undef; + return grep { defined $_ ? not $seen{$k = $_}++ : not $seen_undef++ } @_; +} + +sub singleton (@) +{ + my %seen = (); + my $k; + my $seen_undef; + return grep { 1 == (defined $_ ? $seen{$k = $_} : $seen_undef) } + grep { defined $_ ? not $seen{$k = $_}++ : not $seen_undef++ } @_; +} + +sub duplicates (@) +{ + my %seen = (); + my $k; + my $seen_undef; + return grep { 1 < (defined $_ ? $seen{$k = $_} : $seen_undef) } + grep { defined $_ ? not $seen{$k = $_}++ : not $seen_undef++ } @_; +} + +sub frequency (@) +{ + my %seen = (); + my $k; + my $seen_undef; + my %h = map { defined $_ ? ($_ => $seen{$k = $_}) : () } + grep { defined $_ ? not $seen{$k = $_}++ : not $seen_undef++ } @_; + wantarray or return (scalar keys %h) + ($seen_undef ? 1 : 0); + undef $k; + return (%h, $seen_undef ? (\$k => $seen_undef) : ()); +} + +sub occurrences (@) +{ + my %seen = (); + my $k; + my $seen_undef; + my @ret; + foreach my $l (map { $_ } grep { defined $_ ? not $seen{$k = $_}++ : not $seen_undef++ } @_) + { + my $n = defined $l ? $seen{$l} : $seen_undef; + defined $ret[$n] or $ret[$n] = []; + push @{$ret[$n]}, $l; + } + return @ret; +} + +sub mode (@) +{ + my %seen = (); + my ($max, $k, $seen_undef) = (1); + + foreach (@_) { defined $_ ? 
($max < ++$seen{$k = $_} and ++$max) : ($max < ++$seen_undef and ++$max) } + wantarray or return $max; + + my @ret = ($max); + foreach my $l (grep { $seen{$_} == $max } keys %seen) + { + push @ret, $l; + } + $seen_undef and $seen_undef == $max and push @ret, undef; + return @ret; +} + +sub samples ($@) +{ + my $n = shift; + if ($n > @_) + { + require Carp; + Carp::croak(sprintf("Cannot get %d samples from %d elements", $n, scalar @_)); + } + + for (my $i = @_; @_ - $i > $n;) + { + my $idx = @_ - $i; + my $swp = $idx + int(rand(--$i)); + my $xchg = $_[$swp]; + $_[$swp] = $_[$idx]; + $_[$idx] = $xchg; + } + + return splice @_, 0, $n; +} + +sub minmax (@) +{ + return unless @_; + my $min = my $max = $_[0]; + + for (my $i = 1; $i < @_; $i += 2) + { + if ($_[$i - 1] <= $_[$i]) + { + $min = $_[$i - 1] if $min > $_[$i - 1]; + $max = $_[$i] if $max < $_[$i]; + } + else + { + $min = $_[$i] if $min > $_[$i]; + $max = $_[$i - 1] if $max < $_[$i - 1]; + } + } + + if (@_ & 1) + { + my $i = $#_; + if ($_[$i - 1] <= $_[$i]) + { + $min = $_[$i - 1] if $min > $_[$i - 1]; + $max = $_[$i] if $max < $_[$i]; + } + else + { + $min = $_[$i] if $min > $_[$i]; + $max = $_[$i - 1] if $max < $_[$i - 1]; + } + } + + return ($min, $max); +} + +sub minmaxstr (@) +{ + return unless @_; + my $min = my $max = $_[0]; + + for (my $i = 1; $i < @_; $i += 2) + { + if ($_[$i - 1] le $_[$i]) + { + $min = $_[$i - 1] if $min gt $_[$i - 1]; + $max = $_[$i] if $max lt $_[$i]; + } + else + { + $min = $_[$i] if $min gt $_[$i]; + $max = $_[$i - 1] if $max lt $_[$i - 1]; + } + } + + if (@_ & 1) + { + my $i = $#_; + if ($_[$i - 1] le $_[$i]) + { + $min = $_[$i - 1] if $min gt $_[$i - 1]; + $max = $_[$i] if $max lt $_[$i]; + } + else + { + $min = $_[$i] if $min gt $_[$i]; + $max = $_[$i - 1] if $max lt $_[$i - 1]; + } + } + + return ($min, $max); +} + +sub part (&@) +{ + my ($code, @list) = @_; + my @parts; + push @{$parts[$code->($_)]}, $_ foreach @list; + return @parts; +} + +sub bsearch(&@) +{ + my $code = shift; + + my $rc; + my $i = 0; + my $j = @_; + ## no critic (ControlStructures::ProhibitNegativeExpressionsInUnlessAndUntilConditions) + do + { + my $k = int(($i + $j) / 2); + + $k >= @_ and return; + + local *_ = \$_[$k]; + $rc = $code->(); + + $rc == 0 + and return wantarray ? 
$_ : 1; + + if ($rc < 0) + { + $i = $k + 1; + } + else + { + $j = $k - 1; + } + } until $i > $j; + + return; +} + +sub bsearchidx(&@) +{ + my $code = shift; + + my $rc; + my $i = 0; + my $j = @_; + ## no critic (ControlStructures::ProhibitNegativeExpressionsInUnlessAndUntilConditions) + do + { + my $k = int(($i + $j) / 2); + + $k >= @_ and return -1; + + local *_ = \$_[$k]; + $rc = $code->(); + + $rc == 0 and return $k; + + if ($rc < 0) + { + $i = $k + 1; + } + else + { + $j = $k - 1; + } + } until $i > $j; + + return -1; +} + +sub lower_bound(&@) +{ + my $code = shift; + my $count = @_; + my $first = 0; + while ($count > 0) + { + my $step = $count >> 1; + my $it = $first + $step; + local *_ = \$_[$it]; + if ($code->() < 0) + { + $first = ++$it; + $count -= $step + 1; + } + else + { + $count = $step; + } + } + + return $first; +} + +sub upper_bound(&@) +{ + my $code = shift; + my $count = @_; + my $first = 0; + while ($count > 0) + { + my $step = $count >> 1; + my $it = $first + $step; + local *_ = \$_[$it]; + if ($code->() <= 0) + { + $first = ++$it; + $count -= $step + 1; + } + else + { + $count = $step; + } + } + + return $first; +} + +sub equal_range(&@) +{ + my $lb = &lower_bound(@_); + my $ub = &upper_bound(@_); + return ($lb, $ub); +} + +sub binsert (&$\@) +{ + my $lb = &lower_bound($_[0], @{$_[2]}); + splice @{$_[2]}, $lb, 0, $_[1]; + return $lb; +} + +sub bremove (&\@) +{ + my $lb = &lower_bound($_[0], @{$_[1]}); + return splice @{$_[1]}, $lb, 1; +} + +sub qsort(&\@) +{ + require Carp; + Carp::croak("It's insane to use a pure-perl qsort"); +} + +sub slide(&@) +{ + my $op = shift; + my @l = @_; + + ## no critic (TestingAndDebugging::ProhibitNoStrict, ValuesAndExpressions::ProhibitCommaSeparatedStatements) + # Localise $a, $b + my ($caller_a, $caller_b) = do + { + my $pkg = caller(); + no strict 'refs'; + \*{$pkg . '::a'}, \*{$pkg . '::b'}; + }; + + ## no critic (Variables::RequireInitializationForLocalVars) + # This map expression is also the return value + local (*$caller_a, *$caller_b); + ## no critic (BuiltinFunctions::ProhibitComplexMappings) + return map { + # Assign to $a, $b as refs to caller's array elements + (*$caller_a, *$caller_b) = \($l[$_], $l[$_ + 1]); + + # Perform the transformation + $op->(); + } 0 .. ($#l - 1); +} + +sub slideatatime ($$@) +{ + my ($m, $w, @list) = @_; + my $n = $w - $m - 1; + return $n >= 0 + ? sub { my @r = splice @list, 0, $m; $#list < $n and $n = $#list; @r and push @r, (@list ? @list[0 .. $n] : ()); return @r; } + : sub { return splice @list, 0, $m; }; +} + +sub sort_by(&@) +{ + my ($code, @list) = @_; + return map { $_->[0] } + sort { $a->[1] cmp $b->[1] } + map { [$_, scalar($code->())] } @list; +} + +sub nsort_by(&@) +{ + my ($code, @list) = @_; + return map { $_->[0] } + sort { $a->[1] <=> $b->[1] } + map { [$_, scalar($code->())] } @list; +} + +## no critic (Subroutines::ProhibitUnusedPrivateSubroutines) +sub _XScompiled { return 0 } + +=head1 SEE ALSO + +L + +=head1 AUTHOR + +Jens Rehsack Erehsack AT cpan.orgE + +Adam Kennedy Eadamk@cpan.orgE + +Tassilo von Parseval Etassilo.von.parseval@rwth-aachen.deE + +=head1 COPYRIGHT AND LICENSE + +Some parts copyright 2011 Aaron Crane. + +Copyright 2004 - 2010 by Tassilo von Parseval + +Copyright 2013 - 2017 by Jens Rehsack + +All code added with 0.417 or later is licensed under the Apache License, +Version 2.0 (the "License"); you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +All code until 0.416 is licensed under the same terms as Perl itself, +either Perl version 5.8.4 or, at your option, any later version of +Perl 5 you may have available. + +=cut + +1; diff --git a/src/modules/Parallel/ForkManager.pm b/src/modules/Parallel/ForkManager.pm new file mode 100644 index 0000000..0018826 --- /dev/null +++ b/src/modules/Parallel/ForkManager.pm @@ -0,0 +1,852 @@ +package Parallel::ForkManager; +our $AUTHORITY = 'cpan:DLUX'; +# ABSTRACT: A simple parallel processing fork manager +$Parallel::ForkManager::VERSION = '1.18'; +use POSIX ":sys_wait_h"; +use Storable qw(store retrieve); +use File::Spec; +use File::Temp (); +use File::Path (); +use Carp; + +use strict; + +sub new { + my ($c,$processes,$tempdir)=@_; + + my $h={ + max_proc => $processes, + processes => {}, + in_child => 0, + parent_pid => $$, + auto_cleanup => ($tempdir ? 0 : 1), + waitpid_blocking_sleep => 1, + }; + + + # determine temporary directory for storing data structures + # add it to Parallel::ForkManager object so children can use it + # We don't let it clean up so it won't do it in the child process + # but we have our own DESTROY to do that. + if (not defined($tempdir) or not length($tempdir)) { + $tempdir = File::Temp::tempdir(CLEANUP => 0); + } + die qq|Temporary directory "$tempdir" doesn't exist or is not a directory.| unless (-e $tempdir && -d _); # ensure temp dir exists and is indeed a directory + $h->{tempdir} = $tempdir; + + return bless($h,ref($c)||$c); +}; + +sub start { + my ($s,$identification)=@_; + + die "Cannot start another process while you are in the child process" + if $s->{in_child}; + while ($s->{max_proc} && ( keys %{ $s->{processes} } ) >= $s->{max_proc}) { + $s->on_wait; + $s->wait_one_child(defined $s->{on_wait_period} ? &WNOHANG : undef); + }; + $s->wait_children; + if ($s->{max_proc}) { + my $pid=fork(); + die "Cannot fork: $!" if !defined $pid; + if ($pid) { + $s->{processes}->{$pid}=$identification; + $s->on_start($pid,$identification); + } else { + $s->{in_child}=1 if !$pid; + } + return $pid; + } else { + $s->{processes}->{$$}=$identification; + $s->on_start($$,$identification); + return 0; # Simulating the child which returns 0 + } +} + +sub finish { + my ($s, $x, $r)=@_; + + if ( $s->{in_child} ) { + if (defined($r)) { # store the child's data structure + my $storable_tempfile = File::Spec->catfile($s->{tempdir}, 'Parallel-ForkManager-' . $s->{parent_pid} . '-' . $$ . '.txt'); + my $stored = eval { return &store($r, $storable_tempfile); }; + + # handle Storables errors, IE logcarp or carp returning undef, or die (via logcroak or croak) + if (not $stored or $@) { + warn(qq|The storable module was unable to store the child's data structure to the temp file "$storable_tempfile": | . 
join(', ', $@)); + } + } + CORE::exit($x || 0); + } + if ($s->{max_proc} == 0) { # max_proc == 0 + $s->on_finish($$, $x ,$s->{processes}->{$$}, 0, 0, $r); + delete $s->{processes}->{$$}; + } + return 0; +} + +sub wait_children { + my ($s)=@_; + + return if !keys %{$s->{processes}}; + my $kid; + do { + $kid = $s->wait_one_child(&WNOHANG); + } while defined $kid and ( $kid > 0 or $kid < -1 ); # AS 5.6/Win32 returns negative PIDs +}; + +*wait_childs=*wait_children; # compatibility +*reap_finished_children=*wait_children; # behavioral synonym for clarity + +sub wait_one_child { + my ($s,$par)=@_; + + my $kid; + while (1) { + $kid = $s->_waitpid(-1,$par||=0); + + last unless defined $kid; + + last if $kid == 0 || $kid == -1; # AS 5.6/Win32 returns negative PIDs + redo if !exists $s->{processes}->{$kid}; + my $id = delete $s->{processes}->{$kid}; + + # retrieve child data structure, if any + my $retrieved = undef; + my $storable_tempfile = File::Spec->catfile($s->{tempdir}, 'Parallel-ForkManager-' . $s->{parent_pid} . '-' . $kid . '.txt'); + if (-e $storable_tempfile) { # child has option of not storing anything, so we need to see if it did or not + $retrieved = eval { return &retrieve($storable_tempfile); }; + + # handle Storables errors + if (not $retrieved or $@) { + warn(qq|The storable module was unable to retrieve the child's data structure from the temporary file "$storable_tempfile": | . join(', ', $@)); + } + + # clean up after ourselves + unlink $storable_tempfile; + } + + $s->on_finish( $kid, $? >> 8 , $id, $? & 0x7f, $? & 0x80 ? 1 : 0, $retrieved); + last; + } + $kid; +}; + +sub wait_all_children { + my ($s)=@_; + + while (keys %{ $s->{processes} }) { + $s->on_wait; + $s->wait_one_child(defined $s->{on_wait_period} ? &WNOHANG : undef); + }; +} + +*wait_all_childs=*wait_all_children; # compatibility; + +sub max_procs { $_[0]->{max_proc}; } + +sub is_child { $_[0]->{in_child} } + +sub is_parent { !$_[0]->{in_child} } + +sub running_procs { + my $self = shift; + + my @pids = keys %{ $self->{processes} }; + return @pids; +} + +sub wait_for_available_procs { + my( $self, $nbr ) = @_; + $nbr ||= 1; + + croak "nbr processes '$nbr' higher than the max nbr of processes (@{[ $self->max_procs ]})" + if $nbr > $self->max_procs; + + $self->wait_one_child until $self->max_procs - $self->running_procs >= $nbr; +} + +sub run_on_finish { + my ($s,$code,$pid)=@_; + + $s->{on_finish}->{$pid || 0}=$code; +} + +sub on_finish { + my ($s,$pid,@par)=@_; + + my $code=$s->{on_finish}->{$pid} || $s->{on_finish}->{0} or return 0; + $code->($pid,@par); +}; + +sub run_on_wait { + my ($s,$code, $period)=@_; + + $s->{on_wait}=$code; + $s->{on_wait_period} = $period; +} + +sub on_wait { + my ($s)=@_; + + if(ref($s->{on_wait}) eq 'CODE') { + $s->{on_wait}->(); + if (defined $s->{on_wait_period}) { + local $SIG{CHLD} = sub { } if ! defined $SIG{CHLD}; + select undef, undef, undef, $s->{on_wait_period} + }; + }; +}; + +sub run_on_start { + my ($s,$code)=@_; + + $s->{on_start}=$code; +} + +sub on_start { + my ($s,@par)=@_; + + $s->{on_start}->(@par) if ref($s->{on_start}) eq 'CODE'; +}; + +sub set_max_procs { + my ($s, $mp)=@_; + + $s->{max_proc} = $mp; +} + +sub set_waitpid_blocking_sleep { + my( $self, $period ) = @_; + $self->{waitpid_blocking_sleep} = $period; +} + +sub waitpid_blocking_sleep { + $_[0]->{waitpid_blocking_sleep}; +} + +sub _waitpid { # Call waitpid() in the standard Unix fashion. + my( $self, undef, $flag ) = @_; + + return $flag ? 
$self->_waitpid_non_blocking : $self->_waitpid_blocking; +} + +sub _waitpid_non_blocking { + my $self = shift; + + for my $pid ( $self->running_procs ) { + my $p = waitpid $pid, &WNOHANG or next; + + return $pid if $p != -1; + + warn "child process '$pid' disappeared. A call to `waitpid` outside of Parallel::ForkManager might have reaped it.\n"; + # it's gone. let's clean the process entry + delete $self->{processes}{$pid}; + } + + return; +} + +sub _waitpid_blocking { + my $self = shift; + + # pseudo-blocking + if( my $sleep_period = $self->{waitpid_blocking_sleep} ) { + while() { + my $pid = $self->_waitpid_non_blocking; + + return $pid if defined $pid; + + return unless $self->running_procs; + + select undef, undef, undef, $sleep_period; + } + } + + return waitpid -1, 0; +} + +sub DESTROY { + my ($self) = @_; + + if ($self->{auto_cleanup} && $self->{parent_pid} == $$ && -d $self->{tempdir}) { + File::Path::remove_tree($self->{tempdir}); + } +} + +1; + +__END__ + +=pod + +=encoding UTF-8 + +=head1 NAME + +Parallel::ForkManager - A simple parallel processing fork manager + +=head1 VERSION + +version 1.18 + +=head1 SYNOPSIS + + use Parallel::ForkManager; + + my $pm = Parallel::ForkManager->new($MAX_PROCESSES); + + DATA_LOOP: + foreach my $data (@all_data) { + # Forks and returns the pid for the child: + my $pid = $pm->start and next DATA_LOOP; + + ... do some work with $data in the child process ... + + $pm->finish; # Terminates the child process + } + +=head1 DESCRIPTION + +This module is intended for use in operations that can be done in parallel +where the number of processes to be forked off should be limited. Typical +use is a downloader which will be retrieving hundreds/thousands of files. + +The code for a downloader would look something like this: + + use LWP::Simple; + use Parallel::ForkManager; + + ... + + my @links=( + ["http://www.foo.bar/rulez.data","rulez_data.txt"], + ["http://new.host/more_data.doc","more_data.doc"], + ... + ); + + ... + + # Max 30 processes for parallel download + my $pm = Parallel::ForkManager->new(30); + + LINKS: + foreach my $linkarray (@links) { + $pm->start and next LINKS; # do the fork + + my ($link, $fn) = @$linkarray; + warn "Cannot get $fn from $link" + if getstore($link, $fn) != RC_OK; + + $pm->finish; # do the exit in the child process + } + $pm->wait_all_children; + +First you need to instantiate the ForkManager with the "new" constructor. +You must specify the maximum number of processes to be created. If you +specify 0, then NO fork will be done; this is good for debugging purposes. + +Next, use $pm->start to do the fork. $pm returns 0 for the child process, +and child pid for the parent process (see also L). +The "and next" skips the internal loop in the parent process. NOTE: +$pm->start dies if the fork fails. + +$pm->finish terminates the child process (assuming a fork was done in the +"start"). + +NOTE: You cannot use $pm->start if you are already in the child process. +If you want to manage another set of subprocesses in the child process, +you must instantiate another Parallel::ForkManager object! + +=head1 METHODS + +The comment letter indicates where the method should be run. P for parent, +C for child. + +=over 5 + +=item new $processes + +Instantiate a new Parallel::ForkManager object. You must specify the maximum +number of children to fork off. If you specify 0 (zero), then no children +will be forked. This is intended for debugging purposes. 
+
+The optional second parameter, $tempdir, is only used if you want the
+children to send back a reference to some data (see RETRIEVING DATASTRUCTURES
+below). If not provided, it is set via a call to L<File::Temp>::tempdir().
+
+The new method will die if the temporary directory does not exist or it is not
+a directory.
+
+=item start [ $process_identifier ]
+
+This method does the fork. It returns the pid of the child process for
+the parent, and 0 for the child process. If the $processes parameter
+for the constructor is 0 then, assuming you're in the child process,
+$pm->start simply returns 0.
+
+An optional $process_identifier can be provided to this method... It is used by
+the "run_on_finish" callback (see CALLBACKS) for identifying the finished
+process.
+
+=item finish [ $exit_code [, $data_structure_reference] ]
+
+Closes the child process by exiting and accepts an optional exit code
+(default exit code is 0) which can be retrieved in the parent via callback.
+If the second optional parameter is provided, the child attempts to send
+its contents back to the parent. If you use the program in debug mode
+($processes == 0), this method just calls the callback.
+
+If the $data_structure_reference is provided, then it is serialized and
+passed to the parent process. See RETRIEVING DATASTRUCTURES for more info.
+
+=item set_max_procs $processes
+
+Allows you to set a new maximum number of children to maintain.
+
+=item wait_all_children
+
+You can call this method to wait for all the processes which have been
+forked. This is a blocking wait.
+
+=item reap_finished_children
+
+This is a non-blocking call to reap children and execute callbacks independent
+of calls to "start" or "wait_all_children". Use this in scenarios where "start"
+is called infrequently but you would like the callbacks executed quickly.
+
+=item is_parent
+
+Returns C<true> if within the parent or C<false> if within the child.
+
+=item is_child
+
+Returns C<true> if within the child or C<false> if within the parent.
+
+=item max_procs
+
+Returns the maximal number of processes the object will fork.
+
+=item running_procs
+
+Returns the pids of the forked processes currently monitored by the
+C<Parallel::ForkManager>. Note that children are still reported as running
+until the fork manager harvests them, via the next call to
+C<start> or C<wait_all_children>.
+
+  my @pids = $pm->running_procs;
+
+  my $nbr_children = $pm->running_procs;
+
+=item wait_for_available_procs( $n )
+
+Wait until C<$n> process slots are available.
+If C<$n> is not given, defaults to I<1>.
+
+=item waitpid_blocking_sleep
+
+Returns the sleep period, in seconds, of the pseudo-blocking calls. The sleep
+period can be a fraction of a second.
+
+Returns C<0> if disabled.
+
+Defaults to 1 second.
+
+See I<BLOCKING CALLS> for more details.
+
+=item set_waitpid_blocking_sleep $seconds
+
+Sets the sleep period, in seconds, of the pseudo-blocking calls.
+Set to C<0> to disable.
+
+See I<BLOCKING CALLS> for more details.
+
+=back
+
+=head1 CALLBACKS
+
+You can define callbacks in the code, which are called on events like starting
+a process or upon finish. Declare these before the first call to start().
+
+The callbacks can be defined with the following methods:
+
+=over 4
+
+=item run_on_finish $code [, $pid ]
+
+You can define a subroutine which is called when a child is terminated. It is
+called in the parent process.
+
+The parameters of the $code are the following:
+
+  - pid of the process which is terminated
+  - exit code of the program
+  - identification of the process (if provided in the "start" method)
+  - exit signal (0-127: signal name)
+  - core dump (1 if there was a core dump at exit)
+  - datastructure reference or undef (see RETRIEVING DATASTRUCTURES)
+
+=item run_on_start $code
+
+You can define a subroutine which is called when a child is started. It is
+called after the successful startup of a child in the parent process.
+
+The parameters of the $code are the following:
+
+  - pid of the process which has been started
+  - identification of the process (if provided in the "start" method)
+
+=item run_on_wait $code, [$period]
+
+You can define a subroutine which is called when the child process needs to
+wait for the startup. If $period is not defined, then one call is done per
+child. If $period is defined, then $code is called periodically and the
+module waits for $period seconds between the two calls. Note, $period can be
+a fractional number as well. The exact "$period seconds" is not guaranteed:
+signals can shorten it and the process scheduler can make it longer (on busy
+systems).
+
+The $code is also called in the "start" and the "wait_all_children" methods.
+
+No parameters are passed to the $code on the call.
+
+=back
+
+=head1 BLOCKING CALLS
+
+When it comes to waiting for child processes to terminate,
+C<Parallel::ForkManager> is between a fork and a hard place (if you excuse
+the terrible pun). The underlying Perl C<waitpid> function that the module
+relies on can block until either one specific or any child process
+terminates, but not for a process that is part of a given group.
+
+This means that the module can do one of two things when it waits for
+one of its child processes to terminate:
+
+=over
+
+=item Only wait for its own child processes
+
+This is done via a loop using a C<waitpid> non-blocking call and a sleep
+statement. The code does something along the lines of
+
+  while(1) {
+    if ( any of the P::FM child processes terminated ) {
+      return its pid
+    }
+
+    sleep $sleep_period
+  }
+
+This is the default behavior that the module will use.
+This is not the most efficient way to wait for child processes, but it's
+the safest way to ensure that C<Parallel::ForkManager> won't interfere with
+any other part of the codebase.
+
+The sleep period is set via the method C<set_waitpid_blocking_sleep>.
+
+=item Block until any process terminates
+
+Alternatively, C<Parallel::ForkManager> can call C<waitpid> such that it will
+block until any child process terminates. If the child process was not one of
+the monitored subprocesses, the wait will resume. This is more efficient, but
+it means that C<Parallel::ForkManager> can capture (and discard) a termination
+notification that a different part of the code might be waiting for.
+
+If this is a race condition
+that doesn't apply to your codebase, you can set the
+I<waitpid_blocking_sleep> period to C<0>, which will enable blocking
+C<waitpid> calls.
+
+  my $pm = Parallel::ForkManager->new( 4 );
+
+  $pm->set_waitpid_blocking_sleep(0);  # true blocking calls enabled
+
+  for ( 1..100 ) {
+    $pm->start and next;
+
+    ...; # do work
+
+    $pm->finish;
+  }
+
+=back
+
+=head1 RETRIEVING DATASTRUCTURES from child processes
+
+The ability for the parent to retrieve data structures is new as of version
+0.7.6.
+
+Each child process may optionally send 1 data structure back to the parent.
+By data structure, we mean a reference to a string, hash or array. The
+contents of the data structure are written out to temporary files on disc
+using the L<Storable> module's store() method.
The reference is then
+retrieved from within the code you send to the run_on_finish callback.
+
+The data structure can be any scalar perl data structure which makes sense:
+string, numeric value or a reference to an array, hash or object.
+
+There are 2 steps involved in retrieving data structures:
+
+1) A reference to the data structure the child wishes to send back to the
+parent is provided as the second argument to the finish() call. It is up
+to the child to decide whether or not to send anything back to the parent.
+
+2) The data structure reference is retrieved using the callback provided in
+the run_on_finish() method.
+
+Keep in mind that data structure retrieval is not the same as returning a
+data structure from a method call. That is not what actually occurs. The
+data structure referenced in a given child process is serialized and
+written out to a file by L<Storable>. The file is subsequently read back
+into memory and a new data structure belonging to the parent process is
+created. Please consider the performance penalty this can imply, and try to
+keep the returned structure small.
+
+=head1 EXAMPLES
+
+=head2 Parallel get
+
+This small example can be used to get URLs in parallel.
+
+  use Parallel::ForkManager;
+  use LWP::Simple;
+
+  my $pm = Parallel::ForkManager->new(10);
+
+  LINKS:
+  for my $link (@ARGV) {
+    $pm->start and next LINKS;
+    my ($fn) = $link =~ /^.*\/(.*?)$/;
+    if (!$fn) {
+      warn "Cannot determine filename from $fn\n";
+    } else {
+      $0 .= " " . $fn;
+      print "Getting $fn from $link\n";
+      my $rc = getstore($link, $fn);
+      print "$link downloaded. response code: $rc\n";
+    };
+    $pm->finish;
+  };
+
+=head2 Callbacks
+
+Example of a program using callbacks to get child exit codes:
+
+  use strict;
+  use Parallel::ForkManager;
+
+  my $max_procs = 5;
+  my @names = qw( Fred Jim Lily Steve Jessica Bob Dave Christine Rico Sara );
+  # hash to resolve PID's back to child specific information
+
+  my $pm = Parallel::ForkManager->new($max_procs);
+
+  # Setup a callback for when a child finishes up so we can
+  # get its exit code
+  $pm->run_on_finish( sub {
+      my ($pid, $exit_code, $ident) = @_;
+      print "** $ident just got out of the pool ".
+        "with PID $pid and exit code: $exit_code\n";
+  });
+
+  $pm->run_on_start( sub {
+      my ($pid, $ident)=@_;
+      print "** $ident started, pid: $pid\n";
+  });
+
+  $pm->run_on_wait( sub {
+      print "** Have to wait for one child ...\n"
+    },
+    0.5
+  );
+
+  NAMES:
+  foreach my $child ( 0 .. $#names ) {
+    my $pid = $pm->start($names[$child]) and next NAMES;
+
+    # This code is the child process
+    print "This is $names[$child], Child number $child\n";
+    sleep ( 2 * $child );
+    print "$names[$child], Child $child is about to get out...\n";
+    sleep 1;
+    $pm->finish($child); # pass an exit code to finish
+  }
+
+  print "Waiting for Children...\n";
+  $pm->wait_all_children;
+  print "Everybody is out of the pool!\n";
+
+=head2 Data structure retrieval
+
+In this simple example, each child sends back a string reference.
+
+=head1 EXAMPLES
+
+=head2 Parallel get
+
+This small example can be used to get URLs in parallel.
+
+  use Parallel::ForkManager;
+  use LWP::Simple;
+
+  my $pm = Parallel::ForkManager->new(10);
+
+  LINKS:
+  for my $link (@ARGV) {
+    $pm->start and next LINKS;
+    my ($fn) = $link =~ /^.*\/(.*?)$/;
+    if (!$fn) {
+      warn "Cannot determine filename from $link\n";
+    } else {
+      $0 .= " " . $fn;
+      print "Getting $fn from $link\n";
+      my $rc = getstore($link, $fn);
+      print "$link downloaded. response code: $rc\n";
+    };
+    $pm->finish;
+  };
+
+=head2 Callbacks
+
+Example of a program using callbacks to get child exit codes:
+
+  use strict;
+  use Parallel::ForkManager;
+
+  my $max_procs = 5;
+  my @names = qw( Fred Jim Lily Steve Jessica Bob Dave Christine Rico Sara );
+
+  my $pm = Parallel::ForkManager->new($max_procs);
+
+  # Setup a callback for when a child finishes up so we can
+  # get its exit code
+  $pm->run_on_finish( sub {
+      my ($pid, $exit_code, $ident) = @_;
+      print "** $ident just got out of the pool ".
+        "with PID $pid and exit code: $exit_code\n";
+  });
+
+  $pm->run_on_start( sub {
+      my ($pid, $ident)=@_;
+      print "** $ident started, pid: $pid\n";
+  });
+
+  $pm->run_on_wait( sub {
+      print "** Have to wait for a child to finish ...\n"
+    },
+    0.5
+  );
+
+  NAMES:
+  foreach my $child ( 0 .. $#names ) {
+    my $pid = $pm->start($names[$child]) and next NAMES;
+
+    # This code is the child process
+    print "This is $names[$child], Child number $child\n";
+    sleep ( 2 * $child );
+    print "$names[$child], Child $child is about to get out...\n";
+    sleep 1;
+    $pm->finish($child); # pass an exit code to finish
+  }
+
+  print "Waiting for Children...\n";
+  $pm->wait_all_children;
+  print "Everybody is out of the pool!\n";
+
+=head2 Data structure retrieval
+
+In this simple example, each child sends back a string reference.
+
+  use Parallel::ForkManager 0.7.6;
+  use strict;
+
+  my $pm = Parallel::ForkManager->new(2, '/server/path/to/temp/dir/');
+
+  # data structure retrieval and handling
+  $pm->run_on_finish ( # called BEFORE the first call to start()
+    sub {
+      my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $data_structure_reference) = @_;
+
+      # retrieve data structure from child
+      if (defined($data_structure_reference)) {  # children are not forced to send anything
+        my $string = ${$data_structure_reference};  # child passed a string reference
+        print "$string\n";
+      }
+      else {  # problems occurring during storage or retrieval will throw a warning
+        print qq|No message received from child process $pid!\n|;
+      }
+    }
+  );
+
+  # prep random statement components
+  my @foods = ('chocolate', 'ice cream', 'peanut butter', 'pickles', 'pizza', 'bacon', 'pancakes', 'spaghetti', 'cookies');
+  my @preferences = ('loves', q|can't stand|, 'always wants more', 'will walk 100 miles for', 'only eats', 'would starve rather than eat');
+
+  # run the parallel processes
+  PERSONS:
+  foreach my $person (qw(Fred Wilma Ernie Bert Lucy Ethel Curly Moe Larry)) {
+    $pm->start() and next PERSONS;
+
+    # generate a random statement about food preferences
+    my $statement = $person . ' ' . $preferences[int(rand @preferences)] . ' ' . $foods[int(rand @foods)];
+
+    # send it back to the parent process
+    $pm->finish(0, \$statement);  # note that it's a scalar REFERENCE, not the scalar itself
+  }
+  $pm->wait_all_children;
+
+A second datastructure retrieval example demonstrates how children decide
+whether or not to send anything back, what to send and how the parent should
+process whatever is retrieved.
+
+=for example begin
+
+  use Parallel::ForkManager 0.7.6;
+  use Data::Dumper;  # to display the data structures retrieved.
+  use strict;
+
+  my $pm = Parallel::ForkManager->new(20);  # using the system temp dir
+
+  my %retrieved_responses = ();  # for collecting responses
+
+  # data structure retrieval and handling
+  $pm->run_on_finish (
+    sub {
+      my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $data_structure_reference) = @_;
+
+      # see what the child sent us, if anything
+      if (defined($data_structure_reference)) {  # test rather than assume child sent anything
+        my $reftype = ref($data_structure_reference);
+        print qq|ident "$ident" returned a "$reftype" reference.\n\n|;
+        if (1) {  # simple on/off switch to display the contents
+          print &Dumper($data_structure_reference) . qq|end of "$ident" sent structure\n\n|;
+        }
+
+        # we can also collect retrieved data structures for processing after all children have exited
+        $retrieved_responses{$ident} = $data_structure_reference;
+      } else {
+        print qq|ident "$ident" did not send anything.\n\n|;
+      }
+    }
+  );
+
+  # generate a list of instructions
+  my @instructions = (  # a unique identifier and what the child process should send
+    {'name' => '%ENV keys as a string', 'send' => 'keys'},
+    {'name' => 'Send Nothing'},  # not instructing the child to send anything back to the parent
+    {'name' => 'Childs %ENV', 'send' => 'all'},
+    {'name' => 'Child chooses randomly', 'send' => 'random'},
+    {'name' => 'Invalid send instructions', 'send' => 'Na Na Nana Na'},
+    {'name' => 'ENV values in an array', 'send' => 'values'},
+  );
+
+  INSTRUCTS:
+  foreach my $instruction (@instructions) {
+    $pm->start($instruction->{'name'}) and next INSTRUCTS;  # this time we are using an explicit, unique child process identifier
+
+    # last step in child processing
+    $pm->finish(0) unless $instruction->{'send'};  # no data structure is sent unless this child is told what to send.
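+    # NB: in a child, finish() never returns (it terminates the child
+    # process), so at most one of the finish() calls below runs per child;
+    # the line above therefore acts as an early return when there is
+    # nothing to send.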
+
+    if ($instruction->{'send'} eq 'keys') {
+      $pm->finish(0, \join(', ', keys %ENV));
+
+    } elsif ($instruction->{'send'} eq 'values') {
+      $pm->finish(0, [values %ENV]);  # kinda useless without knowing which keys they belong to...
+
+    } elsif ($instruction->{'send'} eq 'all') {
+      $pm->finish(0, \%ENV);  # remember, we are not "returning" anything, just copying the hash to disc
+
+    # demonstrate clearly that the child determines what type of reference to send
+    } elsif ($instruction->{'send'} eq 'random') {
+      my $string = q|I'm just a string.|;
+      my @array = qw(I am an array);
+      my %hash = (type => 'associative array', synonym => 'hash', cool => 'very :)');
+      my $return_choice = ('string', 'array', 'hash')[int(rand 3)];  # randomly choose return data type
+      $pm->finish(0, \$string) if ($return_choice eq 'string');
+      $pm->finish(0, \@array) if ($return_choice eq 'array');
+      $pm->finish(0, \%hash) if ($return_choice eq 'hash');
+
+    # as a responsible child, inform parent that their instruction was invalid
+    } else {
+      $pm->finish(0, \qq|Invalid instructions: "$instruction->{'send'}".|);  # ordinarily I wouldn't include invalid input in a response...
+    }
+  }
+  $pm->wait_all_children;  # blocks until all forked processes have exited
+
+  # post fork processing of returned data structures
+  for (sort keys %retrieved_responses) {
+    print qq|Post processing "$_"...\n|;
+  }
+
+=for example end
+
+=head1 BUGS AND LIMITATIONS
+
+Do not use Parallel::ForkManager in an environment where other child
+processes can affect the run of the main program; in particular, using this
+module is not recommended in an environment where fork() / wait() is already
+used.
+
+If you want to use more than one copy of Parallel::ForkManager, then
+you have to make sure that all child processes of one have terminated before
+you use the second object in the main program.
+
+You are free to use a new copy of Parallel::ForkManager in the child
+processes, although I don't think it makes sense.
+
+=head1 CREDITS
+
+  Michael Gang (bug report)
+  Noah Robin (documentation tweaks)
+  Chuck Hirstius (callback exit status, example)
+  Grant Hopwood (win32 port)
+  Mark Southern (bugfix)
+  Ken Clarke (datastructure retrieval)
+
+=head1 AUTHORS
+
+=over 4
+
+=item *
+
+dLux (Szabó, Balázs)
+
+=item *
+
+Yanick Champoux
+
+=item *
+
+Gabor Szabo
+
+=back
+
+=head1 COPYRIGHT AND LICENSE
+
+This software is copyright (c) 2000 by Balázs Szabó.
+
+This is free software; you can redistribute it and/or modify it under
+the same terms as the Perl 5 programming language system itself.
+
+=cut
diff --git a/src/modules/Switch.pm b/src/modules/Switch.pm
new file mode 100644
index 0000000..8c2544e
--- /dev/null
+++ b/src/modules/Switch.pm
@@ -0,0 +1,888 @@
+package Switch;
+
+use 5.005;
+use strict;
+use vars qw($VERSION);
+use Carp;
+
+use if $] >= 5.011, 'deprecate';
+
+$VERSION = '2.17';
+
+
+# LOAD FILTERING MODULE...
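+# (Switch is a source filter: it rewrites switch/case statements into plain
+# Perl at compile time, so the filter must be installed before any code that
+# uses the new syntax is parsed.)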
+use Filter::Util::Call; + +sub __(); + +# CATCH ATTEMPTS TO CALL case OUTSIDE THE SCOPE OF ANY switch + +$::_S_W_I_T_C_H = sub { croak "case/when statement not in switch/given block" }; + +my $offset; +my $fallthrough; +my ($Perl5, $Perl6) = (0,0); + +sub import +{ + $fallthrough = grep /\bfallthrough\b/, @_; + $offset = (caller)[2]+1; + filter_add({}) unless @_>1 && $_[1] eq 'noimport'; + my $pkg = caller; + no strict 'refs'; + for ( qw( on_defined on_exists ) ) + { + *{"${pkg}::$_"} = \&$_; + } + *{"${pkg}::__"} = \&__ if grep /__/, @_; + $Perl6 = 1 if grep(/Perl\s*6/i, @_); + $Perl5 = 1 if grep(/Perl\s*5/i, @_) || !grep(/Perl\s*6/i, @_); + 1; +} + +sub unimport +{ + filter_del() +} + +sub filter +{ + my($self) = @_ ; + local $Switch::file = (caller)[1]; + + my $status = 1; + $status = filter_read(1_000_000); + return $status if $status<0; + $_ = filter_blocks($_,$offset); + $_ = "# line $offset\n" . $_ if $offset; undef $offset; + return $status; +} + +use Text::Balanced ':ALL'; + +sub line +{ + my ($pretext,$offset) = @_; + ($pretext=~tr/\n/\n/)+($offset||0); +} + +sub is_block +{ + local $SIG{__WARN__}=sub{die$@}; + local $^W=1; + my $ishash = defined eval 'my $hr='.$_[0]; + undef $@; + return !$ishash; +} + +my $pod_or_DATA = qr/ ^=[A-Za-z] .*? ^=cut (?![A-Za-z]) .*? $ + | ^__(DATA|END)__\n.* + /smx; + +my $casecounter = 1; +sub filter_blocks +{ + my ($source, $line) = @_; + return $source unless $Perl5 && $source =~ /case|switch/ + || $Perl6 && $source =~ /when|given|default/; + pos $source = 0; + my $text = ""; + component: while (pos $source < length $source) + { + if ($source =~ m/(\G\s*use\s+Switch\b)/gc) + { + $text .= q{use Switch 'noimport'}; + next component; + } + my @pos = Text::Balanced::_match_quotelike(\$source,qr/\s*/,1,0); + if (defined $pos[0]) + { + my $pre = substr($source,$pos[0],$pos[1]); # matched prefix + my $iEol; + if( substr($source,$pos[4],$pos[5]) eq '/' && # 1st delimiter + substr($source,$pos[2],$pos[3]) eq '' && # no op like 'm' + index( substr($source,$pos[16],$pos[17]), 'x' ) == -1 && # no //x + ($iEol = index( $source, "\n", $pos[4] )) > 0 && + $iEol < $pos[8] ){ # embedded newlines + # If this is a pattern, it isn't compatible with Switch. Backup past 1st '/'. + pos( $source ) = $pos[6]; + $text .= $pre . substr($source,$pos[2],$pos[6]-$pos[2]); + } else { + $text .= $pre . substr($source,$pos[2],$pos[18]-$pos[2]); + } + next component; + } + if ($source =~ m/(\G\s*$pod_or_DATA)/gc) { + $text .= $1; + next component; + } + @pos = Text::Balanced::_match_variable(\$source,qr/\s*/); + if (defined $pos[0]) + { + $text .= " " if $pos[0] < $pos[2]; + $text .= substr($source,$pos[0],$pos[4]-$pos[0]); + next component; + } + + if ($Perl5 && $source =~ m/\G(\n*)(\s*)(switch)\b(?=\s*[(])/gc + || $Perl6 && $source =~ m/\G(\n*)(\s*)(given)\b(?=\s*[(])/gc + || $Perl6 && $source =~ m/\G(\n*)(\s*)(given)\b(.*)(?=\{)/gc) + { + my $keyword = $3; + my $arg = $4; + $text .= $1.$2.'S_W_I_T_C_H: while (1) '; + unless ($arg) { + @pos = Text::Balanced::_match_codeblock(\$source,qr/\s*/,qr/\(/,qr/\)/,qr/[[{(<]/,qr/[]})>]/,undef) + or do { + die "Bad $keyword statement (problem in the parentheses?) 
near $Switch::file line ", line(substr($source,0,pos $source),$line), "\n"; + }; + $arg = filter_blocks(substr($source,$pos[0],$pos[4]-$pos[0]),line(substr($source,0,$pos[0]),$line)); + } + $arg =~ s {^\s*[(]\s*%} { ( \\\%} || + $arg =~ s {^\s*[(]\s*m\b} { ( qr} || + $arg =~ s {^\s*[(]\s*/} { ( qr/} || + $arg =~ s {^\s*[(]\s*qw} { ( \\qw}; + @pos = Text::Balanced::_match_codeblock(\$source,qr/\s*/,qr/\{/,qr/\}/,qr/\{/,qr/\}/,undef) + or do { + die "Bad $keyword statement (problem in the code block?) near $Switch::file line ", line(substr($source,0, pos $source), $line), "\n"; + }; + my $code = filter_blocks(substr($source,$pos[0],$pos[4]-$pos[0]),line(substr($source,0,$pos[0]),$line)); + $code =~ s/{/{ local \$::_S_W_I_T_C_H; Switch::switch($arg);/; + $text .= $code . 'continue {last}'; + next component; + } + elsif ($Perl5 && $source =~ m/\G(\s*)(case\b)(?!\s*=>)/gc + || $Perl6 && $source =~ m/\G(\s*)(when\b)(?!\s*=>)/gc + || $Perl6 && $source =~ m/\G(\s*)(default\b)(?=\s*\{)/gc) + { + my $keyword = $2; + $text .= $1 . ($keyword eq "default" + ? "if (1)" + : "if (Switch::case"); + + if ($keyword eq "default") { + # Nothing to do + } + elsif (@pos = Text::Balanced::_match_codeblock(\$source,qr/\s*/,qr/\{/,qr/\}/,qr/\{/,qr/\}/,undef)) { + my $code = substr($source,$pos[0],$pos[4]-$pos[0]); + $text .= " " if $pos[0] < $pos[2]; + $text .= "sub " if is_block $code; + $text .= filter_blocks($code,line(substr($source,0,$pos[0]),$line)) . ")"; + } + elsif (@pos = Text::Balanced::_match_codeblock(\$source,qr/\s*/,qr/[[(]/,qr/[])]/,qr/[[({]/,qr/[])}]/,undef)) { + my $code = filter_blocks(substr($source,$pos[0],$pos[4]-$pos[0]),line(substr($source,0,$pos[0]),$line)); + $code =~ s {^\s*[(]\s*%} { ( \\\%} || + $code =~ s {^\s*[(]\s*m\b} { ( qr} || + $code =~ s {^\s*[(]\s*/} { ( qr/} || + $code =~ s {^\s*[(]\s*qw} { ( \\qw}; + $text .= " " if $pos[0] < $pos[2]; + $text .= "$code)"; + } + elsif ($Perl6 && do{@pos = Text::Balanced::_match_variable(\$source,qr/\s*/)}) { + my $code = filter_blocks(substr($source,$pos[0],$pos[4]-$pos[0]),line(substr($source,0,$pos[0]),$line)); + $code =~ s {^\s*%} { \%} || + $code =~ s {^\s*@} { \@}; + $text .= " " if $pos[0] < $pos[2]; + $text .= "$code)"; + } + elsif ( @pos = Text::Balanced::_match_quotelike(\$source,qr/\s*/,1,0)) { + my $code = substr($source,$pos[2],$pos[18]-$pos[2]); + $code = filter_blocks($code,line(substr($source,0,$pos[2]),$line)); + $code =~ s {^\s*m} { qr} || + $code =~ s {^\s*/} { qr/} || + $code =~ s {^\s*qw} { \\qw}; + $text .= " " if $pos[0] < $pos[2]; + $text .= "$code)"; + } + elsif ($Perl5 && $source =~ m/\G\s*(([^\$\@{])[^\$\@{]*)(?=\s*{)/gc + || $Perl6 && $source =~ m/\G\s*([^;{]*)()/gc) { + my $code = filter_blocks($1,line(substr($source,0,pos $source),$line)); + $text .= ' \\' if $2 eq '%'; + $text .= " $code)"; + } + else { + die "Bad $keyword statement (invalid $keyword value?) near $Switch::file line ", line(substr($source,0,pos $source), $line), "\n"; + } + + die "Missing opening brace or semi-colon after 'when' value near $Switch::file line ", line(substr($source,0,pos $source), $line), "\n" + unless !$Perl6 || $source =~ m/\G(\s*)(?=;|\{)/gc; + + do{@pos = Text::Balanced::_match_codeblock(\$source,qr/\s*/,qr/\{/,qr/\}/,qr/\{/,qr/\}/,undef)} + or do { + if ($source =~ m/\G\s*(?=([};]|\Z))/gc) { + $casecounter++; + next component; + } + die "Bad $keyword statement (problem in the code block?) 
near $Switch::file line ", line(substr($source,0,pos $source),$line), "\n";
+            };
+        my $code = filter_blocks(substr($source,$pos[0],$pos[4]-$pos[0]),line(substr($source,0,$pos[0]),$line));
+        $code =~ s/}(?=\s*\Z)/;last S_W_I_T_C_H }/
+            unless $fallthrough;
+        $text .= "{ while (1) $code continue { goto C_A_S_E_$casecounter } last S_W_I_T_C_H; C_A_S_E_$casecounter: }";
+        $casecounter++;
+        next component;
+    }
+
+    $source =~ m/\G(\s*(-[sm]\s+|\w+|#.*\n|\W))/gc;
+    $text .= $1;
+    }
+    $text;
+}
+
+
+
+sub in
+{
+    my ($x,$y) = @_;
+    my @numy;
+    for my $nextx ( @$x )
+    {
+        my $numx = ref($nextx) || defined $nextx && (~$nextx&$nextx) eq 0;
+        for my $j ( 0..$#$y )
+        {
+            my $nexty = $y->[$j];
+            push @numy, ref($nexty) || defined $nexty && (~$nexty&$nexty) eq 0
+                if @numy <= $j;
+            return 1 if $numx && $numy[$j] && $nextx==$nexty
+                 || $nextx eq $nexty;
+
+        }
+    }
+    return "";
+}
+
+sub on_exists
+{
+    my $ref = @_==1 && ref($_[0]) eq 'HASH' ? $_[0] : { @_ };
+    [ keys %$ref ]
+}
+
+sub on_defined
+{
+    my $ref = @_==1 && ref($_[0]) eq 'HASH' ? $_[0] : { @_ };
+    [ grep { defined $ref->{$_} } keys %$ref ]
+}
+
+sub switch(;$)
+{
+    my ($s_val) = @_ ? $_[0] : $_;
+    my $s_ref = ref $s_val;
+
+    if ($s_ref eq 'CODE')
+    {
+        $::_S_W_I_T_C_H =
+              sub { my $c_val = $_[0];
+                    return $s_val == $c_val  if ref $c_val eq 'CODE';
+                    return $s_val->(@$c_val) if ref $c_val eq 'ARRAY';
+                    return $s_val->($c_val);
+                  };
+    }
+    elsif ($s_ref eq "" && defined $s_val && (~$s_val&$s_val) eq 0) # NUMERIC SCALAR
+    {
+        $::_S_W_I_T_C_H =
+              sub { my $c_val = $_[0];
+                    my $c_ref = ref $c_val;
+                    return $s_val == $c_val     if $c_ref eq ""
+                                                && defined $c_val
+                                                && (~$c_val&$c_val) eq 0;
+                    return $s_val eq $c_val     if $c_ref eq "";
+                    return in([$s_val],$c_val)  if $c_ref eq 'ARRAY';
+                    return $c_val->($s_val)     if $c_ref eq 'CODE';
+                    return $c_val->call($s_val) if $c_ref eq 'Switch';
+                    return scalar $s_val=~/$c_val/
+                                                if $c_ref eq 'Regexp';
+                    return scalar $c_val->{$s_val}
+                                                if $c_ref eq 'HASH';
+                    return;
+                  };
+    }
+    elsif ($s_ref eq "")                        # STRING SCALAR
+    {
+        $::_S_W_I_T_C_H =
+              sub { my $c_val = $_[0];
+                    my $c_ref = ref $c_val;
+                    return $s_val eq $c_val     if $c_ref eq "";
+                    return in([$s_val],$c_val)  if $c_ref eq 'ARRAY';
+                    return $c_val->($s_val)     if $c_ref eq 'CODE';
+                    return $c_val->call($s_val) if $c_ref eq 'Switch';
+                    return scalar $s_val=~/$c_val/
+                                                if $c_ref eq 'Regexp';
+                    return scalar $c_val->{$s_val}
+                                                if $c_ref eq 'HASH';
+                    return;
+                  };
+    }
+    elsif ($s_ref eq 'ARRAY')
+    {
+        $::_S_W_I_T_C_H =
+              sub { my $c_val = $_[0];
+                    my $c_ref = ref $c_val;
+                    return in($s_val,[$c_val])  if $c_ref eq "";
+                    return in($s_val,$c_val)    if $c_ref eq 'ARRAY';
+                    return $c_val->(@$s_val)    if $c_ref eq 'CODE';
+                    return $c_val->call(@$s_val)
+                                                if $c_ref eq 'Switch';
+                    return scalar grep {$_=~/$c_val/} @$s_val
+                                                if $c_ref eq 'Regexp';
+                    return scalar grep {$c_val->{$_}} @$s_val
+                                                if $c_ref eq 'HASH';
+                    return;
+                  };
+    }
+    elsif ($s_ref eq 'Regexp')
+    {
+        $::_S_W_I_T_C_H =
+              sub { my $c_val = $_[0];
+                    my $c_ref = ref $c_val;
+                    return $c_val=~/$s_val/     if $c_ref eq "";
+                    return scalar grep {$_=~/$s_val/} @$c_val
+                                                if $c_ref eq 'ARRAY';
+                    return $c_val->($s_val)     if $c_ref eq 'CODE';
+                    return $c_val->call($s_val) if $c_ref eq 'Switch';
+                    return $s_val eq $c_val     if $c_ref eq 'Regexp';
+                    return grep {$_=~/$s_val/ && $c_val->{$_}} keys %$c_val
+                                                if $c_ref eq 'HASH';
+                    return;
+                  };
+    }
+    elsif ($s_ref eq 'HASH')
+    {
+        $::_S_W_I_T_C_H =
+              sub { my $c_val = $_[0];
+                    my $c_ref = ref $c_val;
+                    return $s_val->{$c_val}     if $c_ref eq "";
+                    return scalar grep {$s_val->{$_}} @$c_val
+                                                if $c_ref eq 
'ARRAY'; + return $c_val->($s_val) if $c_ref eq 'CODE'; + return $c_val->call($s_val) if $c_ref eq 'Switch'; + return grep {$_=~/$c_val/ && $s_val->{"$_"}} keys %$s_val + if $c_ref eq 'Regexp'; + return $s_val==$c_val if $c_ref eq 'HASH'; + return; + }; + } + elsif ($s_ref eq 'Switch') + { + $::_S_W_I_T_C_H = + sub { my $c_val = $_[0]; + return $s_val == $c_val if ref $c_val eq 'Switch'; + return $s_val->call(@$c_val) + if ref $c_val eq 'ARRAY'; + return $s_val->call($c_val); + }; + } + else + { + croak "Cannot switch on $s_ref"; + } + return 1; +} + +sub case($) { local $SIG{__WARN__} = \&carp; + $::_S_W_I_T_C_H->(@_); } + +# IMPLEMENT __ + +my $placeholder = bless { arity=>1, impl=>sub{$_[1+$_[0]]} }; + +sub __() { $placeholder } + +sub __arg($) +{ + my $index = $_[0]+1; + bless { arity=>0, impl=>sub{$_[$index]} }; +} + +sub hosub(&@) +{ + # WRITE THIS +} + +sub call +{ + my ($self,@args) = @_; + return $self->{impl}->(0,@args); +} + +sub meta_bop(&) +{ + my ($op) = @_; + sub + { + my ($left, $right, $reversed) = @_; + ($right,$left) = @_ if $reversed; + + my $rop = ref $right eq 'Switch' + ? $right + : bless { arity=>0, impl=>sub{$right} }; + + my $lop = ref $left eq 'Switch' + ? $left + : bless { arity=>0, impl=>sub{$left} }; + + my $arity = $lop->{arity} + $rop->{arity}; + + return bless { + arity => $arity, + impl => sub { my $start = shift; + return $op->($lop->{impl}->($start,@_), + $rop->{impl}->($start+$lop->{arity},@_)); + } + }; + }; +} + +sub meta_uop(&) +{ + my ($op) = @_; + sub + { + my ($left) = @_; + + my $lop = ref $left eq 'Switch' + ? $left + : bless { arity=>0, impl=>sub{$left} }; + + my $arity = $lop->{arity}; + + return bless { + arity => $arity, + impl => sub { $op->($lop->{impl}->(@_)) } + }; + }; +} + + +use overload + "+" => meta_bop {$_[0] + $_[1]}, + "-" => meta_bop {$_[0] - $_[1]}, + "*" => meta_bop {$_[0] * $_[1]}, + "/" => meta_bop {$_[0] / $_[1]}, + "%" => meta_bop {$_[0] % $_[1]}, + "**" => meta_bop {$_[0] ** $_[1]}, + "<<" => meta_bop {$_[0] << $_[1]}, + ">>" => meta_bop {$_[0] >> $_[1]}, + "x" => meta_bop {$_[0] x $_[1]}, + "." => meta_bop {$_[0] . $_[1]}, + "<" => meta_bop {$_[0] < $_[1]}, + "<=" => meta_bop {$_[0] <= $_[1]}, + ">" => meta_bop {$_[0] > $_[1]}, + ">=" => meta_bop {$_[0] >= $_[1]}, + "==" => meta_bop {$_[0] == $_[1]}, + "!=" => meta_bop {$_[0] != $_[1]}, + "<=>" => meta_bop {$_[0] <=> $_[1]}, + "lt" => meta_bop {$_[0] lt $_[1]}, + "le" => meta_bop {$_[0] le $_[1]}, + "gt" => meta_bop {$_[0] gt $_[1]}, + "ge" => meta_bop {$_[0] ge $_[1]}, + "eq" => meta_bop {$_[0] eq $_[1]}, + "ne" => meta_bop {$_[0] ne $_[1]}, + "cmp" => meta_bop {$_[0] cmp $_[1]}, + "\&" => meta_bop {$_[0] & $_[1]}, + "^" => meta_bop {$_[0] ^ $_[1]}, + "|" => meta_bop {$_[0] | $_[1]}, + "atan2" => meta_bop {atan2 $_[0], $_[1]}, + + "neg" => meta_uop {-$_[0]}, + "!" 
=> meta_uop {!$_[0]},
+    "~"     =>  meta_uop {~$_[0]},
+    "cos"   =>  meta_uop {cos $_[0]},
+    "sin"   =>  meta_uop {sin $_[0]},
+    "exp"   =>  meta_uop {exp $_[0]},
+    "abs"   =>  meta_uop {abs $_[0]},
+    "log"   =>  meta_uop {log $_[0]},
+    "sqrt"  =>  meta_uop {sqrt $_[0]},
+    "bool"  => sub { croak "Can't use && or || in expression containing __" },
+
+    #   "&()"   =>  sub { $_[0]->{impl} },
+
+    #   "||"    =>  meta_bop {$_[0] || $_[1]},
+    #   "&&"    =>  meta_bop {$_[0] && $_[1]},
+    # fallback => 1,
+    ;
+1;
+
+__END__
+
+
+=head1 NAME
+
+Switch - A switch statement for Perl, do not use if you can use given/when
+
+=head1 SYNOPSIS
+
+    use Switch;
+
+    switch ($val) {
+        case 1          { print "number 1" }
+        case "a"        { print "string a" }
+        case [1..10,42] { print "number in list" }
+        case (\@array)  { print "number in list" }
+        case /\w+/      { print "pattern" }
+        case qr/\w+/    { print "pattern" }
+        case (\%hash)   { print "entry in hash" }
+        case (\&sub)    { print "arg to subroutine" }
+        else            { print "previous case not true" }
+    }
+
+=head1 BACKGROUND
+
+[Skip ahead to L<"DESCRIPTION"> if you don't care about the whys
+and wherefores of this control structure]
+
+In seeking to devise a "Swiss Army" case mechanism suitable for Perl,
+it is useful to generalize this notion of distributed conditional
+testing as far as possible. Specifically, the concept of "matching"
+between the switch value and the various case values need not be
+restricted to numeric (or string or referential) equality, as it is in other
+languages. Indeed, as Table 1 illustrates, Perl
+offers at least eighteen different ways in which two values could
+generate a match.
+
+    Table 1: Matching a switch value ($s) with a case value ($c)
+
+        Switch  Case    Type of Match            Implied Matching Code
+        Value   Value
+        ======  =====   =====================    =============
+
+        number  same    numeric or referential   match if $s == $c;
+        or ref          equality
+
+        object  method  result of method call    match if $s->$c();
+        ref     name                             match if defined $s->$c();
+                or ref
+
+        other   other   string equality          match if $s eq $c;
+        non-ref non-ref
+        scalar  scalar
+
+        string  regexp  pattern match            match if $s =~ /$c/;
+
+        array   scalar  array entry existence    match if 0<=$c && $c<@$s;
+        ref             array entry definition   match if defined $s->[$c];
+                        array entry truth        match if $s->[$c];
+
+        array   array   array intersection       match if intersects(@$s, @$c);
+        ref     ref     (apply this table to
+                        all pairs of elements
+                        $s->[$i] and
+                        $c->[$j])
+
+        array   regexp  array grep               match if grep /$c/, @$s;
+        ref
+
+        hash    scalar  hash entry existence     match if exists $s->{$c};
+        ref             hash entry definition    match if defined $s->{$c};
+                        hash entry truth         match if $s->{$c};
+
+        hash    regexp  hash grep                match if grep /$c/, keys %$s;
+        ref
+
+        sub     scalar  return value defn        match if defined $s->($c);
+        ref             return value truth       match if $s->($c);
+
+        sub     array   return value defn        match if defined $s->(@$c);
+        ref     ref     return value truth       match if $s->(@$c);
+
+
+In reality, Table 1 covers 31 alternatives, because only the equality and
+intersection tests are commutative; in all other cases, the roles of
+the C<$s> and C<$c> variables could be reversed to produce a
+different test. For example, instead of testing a single hash for
+the existence of a series of keys (C<< exists $s->{$c} >>),
+one could test for the existence of a single key in a series of hashes
+(C<< exists $c->{$s} >>).
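+
+For example (a sketch; the hash and its keys are illustrative), the same
+membership test can be phrased in either direction:
+
+    my %prefs = ( beer => 1, pizza => 1 );
+
+    switch (\%prefs) { case "beer"    { print "match\n" } }  # tests $s->{$c}
+    switch ("beer")  { case (\%prefs) { print "match\n" } }  # tests $c->{$s}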
+
+=head1 DESCRIPTION
+
+The Switch.pm module implements a generalized case mechanism that covers
+most (but not all) of the numerous possible combinations of switch and case
+values described above.
+
+The module augments the standard Perl syntax with two new control
+statements: C<switch> and C<case>. The C<switch> statement takes a
+single scalar argument of any type, specified in parentheses.
+C<switch> stores this value as the
+current switch value in a (localized) control variable.
+The value is followed by a block which may contain one or more
+Perl statements (including the C<case> statement described below).
+The block is unconditionally executed once the switch value has
+been cached.
+
+A C<case> statement takes a single scalar argument (in mandatory
+parentheses if it's a variable; otherwise the parens are optional) and
+selects the appropriate type of matching between that argument and the
+current switch value. The type of matching used is determined by the
+respective types of the switch value and the C<case> argument, as
+specified in Table 1. If the match is successful, the mandatory
+block associated with the C<case> statement is executed.
+
+In most other respects, the C<case> statement is semantically identical
+to an C<if> statement. For example, it can be followed by an C<else>
+clause, and can be used as a postfix statement qualifier.
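+
+For instance (a small sketch of both forms):
+
+    switch ($val) {
+        case /\d+/ { print "digits\n" } else { print "no digits\n" }
+        print "punctuation\n" case /\W/;
+    }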
+
+However, when a C<case> block has been executed control is automatically
+transferred to the statement after the immediately enclosing C<switch>
+block, rather than to the next statement within the block. In other
+words, the success of any C<case> statement prevents other cases in the
+same scope from executing. But see L<"Allowing fall-through"> below.
+
+Together these two new statements provide a fully generalized case
+mechanism:
+
+    use Switch;
+
+    # AND LATER...
+
+    %special = ( woohoo => 1,  d'oh => 1 );
+
+    while (<>) {
+        chomp;
+        switch ($_) {
+            case (%special) { print "homer\n"; }          # if $special{$_}
+            case /[a-z]/i   { print "alpha\n"; }          # if $_ =~ /[a-z]/i
+            case [1..9]     { print "small num\n"; }      # if $_ in [1..9]
+            case { $_[0] >= 10 } { print "big num\n"; }   # if $_ >= 10
+            print "must be punctuation\n" case /\W/;      # if $_ =~ /\W/
+        }
+    }
+
+Note that C<case>s can be nested within C<case> (or any other) blocks,
+and a series of C<case> statements can try different types of matches
+-- hash membership, pattern match, array intersection, simple equality,
+etc. -- against the same switch value.
+
+The use of intersection tests against an array reference is particularly
+useful for aggregating integral cases:
+
+    sub classify_digit
+    {
+        switch ($_[0]) { case 0            { return 'zero' }
+                         case [2,4,6,8]    { return 'even' }
+                         case [1,3,5,7,9]  { return 'odd' }
+                         case /[A-F]/i     { return 'hex' }
+                       }
+    }
+
+
+=head2 Allowing fall-through
+
+Fall-through (trying another case after one has already succeeded)
+is usually a Bad Idea in a switch statement. However, this
+is Perl, not a police state, so there I<is> a way to do it, if you must.
+
+If a C<case> block executes an untargeted C<next>, control is
+immediately transferred to the statement I<after> the C<case> statement
+(i.e. usually another case), rather than out of the surrounding
+C<switch> block.
+
+For example:
+
+    switch ($val) {
+        case 1      { handle_num_1(); next }    # and try next case...
+        case "1"    { handle_str_1(); next }    # and try next case...
+        case [0..9] { handle_num_any(); }       # and we're done
+        case /\d/   { handle_dig_any(); next }  # and try next case...
+        case /.*/   { handle_str_any(); next }  # and try next case...
+    }
+
+If $val held the number C<1>, the above C<switch> block would call the
+first three C<handle_...> subroutines, jumping to the next case test
+each time it encountered a C<next>. After the third C<case> block
+was executed, control would jump to the end of the enclosing
+C<switch> block.
+
+On the other hand, if $val held C<10>, then only the last two C<handle_...>
+subroutines would be called.
+
+Note that this mechanism allows the notion of I<conditional fall-through>.
+For example:
+
+    switch ($val) {
+        case [0..9] { handle_num_any(); next if $val < 7; }
+        case /\d/   { handle_dig_any(); }
+    }
+
+If an untargeted C<last> statement is executed in a case block, this
+immediately transfers control out of the enclosing C<switch> block
+(in other words, there is an implicit C<last> at the end of each
+normal C<case> block). Thus the previous example could also have been
+written:
+
+    switch ($val) {
+        case [0..9] { handle_num_any(); last if $val >= 7; next; }
+        case /\d/   { handle_dig_any(); }
+    }
+
+
+=head2 Automating fall-through
+
+In situations where case fall-through should be the norm, rather than an
+exception, an endless succession of terminal C<next>s is tedious and ugly.
+Hence, it is possible to reverse the default behaviour by specifying
+the string "fallthrough" when importing the module. For example, the
+following code is equivalent to the first example in L<"Allowing fall-through">:
+
+    use Switch 'fallthrough';
+
+    switch ($val) {
+        case 1      { handle_num_1(); }
+        case "1"    { handle_str_1(); }
+        case [0..9] { handle_num_any(); last }
+        case /\d/   { handle_dig_any(); }
+        case /.*/   { handle_str_any(); }
+    }
+
+Note the explicit use of a C<last> to preserve the non-fall-through
+behaviour of the third case.
+
+
+
+=head2 Alternative syntax
+
+Perl 6 will provide a built-in switch statement with essentially the
+same semantics as those offered by Switch.pm, but with a different
+pair of keywords. In Perl 6 C<switch> will be spelled C<given>, and
+C<case> will be pronounced C<when>. In addition, the C<when> statement
+will not require switch or case values to be parenthesized.
+
+This future syntax is also (largely) available via the Switch.pm module, by
+importing it with the argument C<"Perl6">. For example:
+
+    use Switch 'Perl6';
+
+    given ($val) {
+        when 1       { handle_num_1(); }
+        when ($str1) { handle_str_1(); }
+        when [0..9]  { handle_num_any(); last }
+        when /\d/    { handle_dig_any(); }
+        when /.*/    { handle_str_any(); }
+        default      { handle anything else; }
+    }
+
+Note that scalars still need to be parenthesized, since they would be
+ambiguous in Perl 5.
+
+Note too that you can mix and match both syntaxes by importing the module
+with:
+
+    use Switch 'Perl5', 'Perl6';
+
+
+=head2 Higher-order Operations
+
+One situation in which C<switch> and C<case> do not provide a good
+substitute for a cascaded C<if>, is where a switch value needs to
+be tested against a series of conditions. For example:
+
+    sub beverage {
+        switch (shift) {
+            case { $_[0] < 10 } { return 'milk' }
+            case { $_[0] < 20 } { return 'coke' }
+            case { $_[0] < 30 } { return 'beer' }
+            case { $_[0] < 40 } { return 'wine' }
+            case { $_[0] < 50 } { return 'malt' }
+            case { $_[0] < 60 } { return 'Moet' }
+            else                { return 'milk' }
+        }
+    }
+
+(This is equivalent to writing C<< if ($_[0] < 10) { return 'milk' } >>,
+etc.; C<$_[0]> is the argument to the anonymous subroutine.)
+
+The need to specify each condition as a subroutine block is tiresome. To
+overcome this, when importing Switch.pm, a special "placeholder"
+subroutine named C<__> [sic] may also be imported. This subroutine
+converts (almost) any expression in which it appears to a reference to a
+higher-order function. 
That is, the expression:
+
+    use Switch '__';
+
+    __ < 2
+
+is equivalent to:
+
+    sub { $_[0] < 2 }
+
+With C<__>, the previous ugly case statements can be rewritten:
+
+    case  __ < 10  { return 'milk' }
+    case  __ < 20  { return 'coke' }
+    case  __ < 30  { return 'beer' }
+    case  __ < 40  { return 'wine' }
+    case  __ < 50  { return 'malt' }
+    case  __ < 60  { return 'Moet' }
+    else           { return 'milk' }
+
+The C<__> subroutine makes extensive use of operator overloading to
+perform its magic. All operations involving __ are overloaded to
+produce an anonymous subroutine that implements a lazy version
+of the original operation.
+
+The only problem is that operator overloading does not allow the
+boolean operators C<&&> and C<||> to be overloaded. So a case statement
+like this:
+
+    case  0 <= __ && __ < 10  { return 'digit' }
+
+doesn't act as expected, because when it is
+executed, it constructs two higher order subroutines
+and then treats the two resulting references as arguments to C<&&>:
+
+    sub { 0 <= $_[0] } && sub { $_[0] < 10 }
+
+This boolean expression is inevitably true, since both references are
+non-false. Fortunately, the overloaded C<'bool'> operator catches this
+situation and flags it as an error.
+
+=head1 DEPENDENCIES
+
+The module is implemented using Filter::Util::Call and Text::Balanced
+and requires both these modules to be installed.
+
+=head1 AUTHOR
+
+Damian Conway (damian@conway.org). This module is now maintained by
+Alexandr Ciornii (alexchorny@gmail.com). It was previously maintained by
+Rafael Garcia-Suarez and the perl5 porters.
+
+=head1 BUGS
+
+There are undoubtedly serious bugs lurking somewhere in code this funky :-)
+Bug reports and other feedback are most welcome.
+
+May create syntax errors in other parts of code.
+
+On perl 5.10.x, may cause a syntax error if "case" is present inside a heredoc.
+
+In general, use given/when instead. It was introduced in perl 5.10.0,
+which was released in 2007.
+
+=head1 LIMITATIONS
+
+Due to the heuristic nature of Switch.pm's source parsing, the presence of
+regexes with embedded newlines that are specified with raw C</.../>
+delimiters and don't have a modifier C</x> are indistinguishable from
+code chunks beginning with the division operator C</>. As a workaround
+you must use C<m/.../> or C<m?...?> for such patterns. Also, the presence
+of regexes specified with raw C<?...?> delimiters may cause mysterious
+errors. The workaround is to use C<m?...?> instead.
+
+Due to the way source filters work in Perl, you can't use Switch inside
+a string C<eval>.
+
+May not work if sub prototypes are used (RT#33988).
+
+Regex captures in C<when> are not available to the code block.
+
+If your source file is longer than 1 million characters and you have a
+switch statement that crosses the 1 million (or 2 million, etc.)
+character boundary you will get mysterious errors. The workaround is to
+use smaller source files.
+
+=head1 COPYRIGHT
+
+    Copyright (c) 1997-2008, Damian Conway. All Rights Reserved.
+    This module is free software. It may be used, redistributed
+    and/or modified under the same terms as Perl itself.
+ diff --git a/src/processfiles.py b/src/processfiles.py new file mode 100644 index 0000000..b11fc41 --- /dev/null +++ b/src/processfiles.py @@ -0,0 +1,1530 @@ +#!/usr/bin/python + +import os +import re +import subprocess +import pdb +import xml.etree.ElementTree as ET +import gzip +import wget +import pandas as pd +import itertools +import zipfile +from io import StringIO +import progressbar +import json +import tarfile +import scipy +from scipy import stats +import multiprocessing +from tqdm import tqdm +import numpy as np +import glob + +#Set to the established download directory +dwn_dir = 'downloads/' + +#Set to the desired processed files directory +pro_dir = 'processed/' + +def process_DGIdb(): + + print('Formating DGIdb file...') + + dgidb = 'DGIdb_interactions.tsv' + dgidb_dwn = 'DGIdb_interactions_dwn.tsv' + + #Clean in the output file those lines that are matches not found and header lines introduced in each access + outputh = open (pro_dir+dgidb, 'w') + inputh = open (dwn_dir+dgidb_dwn, 'r') + outputh.write('\t'.join(['gene_name','drug_name','interaction_type','source','gene_categories'])+'\n') + for line in inputh: + line = line.rstrip('\n') + line_a = line.split('\t') + if re.search('Possible suggestions:',line_a[0]) == None and re.search('Unmatched search term:',line_a[0]) == None and re.search('gene_name',line_a[0]) == None: + if line_a[3] not in ['CIViC', 'OncoKB']: + outputh.write(line+'\n') + inputh.close() + outputh.close() + +def process_sabdab(): + + print('Formating SAbDab file...') + + sabdab_file = 'sabdab.tsv' + sabdab_dwn = 'TheraSAbDab_SeqStruc_OnlineDownload.csv' + + sabdab_targets = {'Anthrax Protective Antigen': '', 'APP Abeta 1-40/42;TFRC': 'APP;TFRC', 'AR-C124910XX': '', 'CALCA&CALCB': 'CALCA;CALCB', 'Canine NGFB': '', 'CD41_CD61': 'CD41;CD61', 'CEACAM5&CD3E;CD3E': 'CEACAM5;CD3E', 'Dabigatran': '', 'DNA/H1 Complex': '', 'ERBB2 (Domain II);ERBB2 (Domain IV)': 'ERBB2', 'FN extra domain B': 'FN1', 'FZD Family': 'FZD1;FZD2;FZD3;FZD4;FZD5;FZD6;FZD7;FZD8;FZD9;FZD10', 'Ganglioside GD2': 'B4GALNT1', 'Heat Shock Protein 90 Homolog': 'HSP90', 'HHV gB AD-1': '', 'HIV-1 gp120': '', 'HIV-1 gp120 CD4bs': '', 'HIV-1 gp120 V3': '', 'Idiotope of anti-NeuGc-ganliosides': '', 'IGF1&IGF2': 'IGF1;IGF2', 'IL31 (Canine)': '', 'IL31 (Feline)': '', 'Influenza A HA': '', 'Influenza A HA2': '', 'Influenza B Virus': '', 'ITGA2B_ITGB3': 'ITGA2B;ITGB3', 'ITGA4_ITGB7': 'ITGA4;ITGB7', 'ITGA4_ITGB7&ITGAE_ITGB7': 'ITGA4;ITGAE;ITGB7', 'ITGAV_ITGB3': 'ITGAV;ITGB3', 'MRSV envelope protein': '', 'NGFB (Canine)': '', 'Non-Binding': '', 'NOTCH2&3': 'NOTCH2;NOTCH3', 'PcrV type III secretion system': '', 'PcrV type III secretion system;Polysaccharide synthesis locus (Pseudomonas)': '', 'PDCD1 (Canine)': '', 'Phosphatidylserine': 'PTDSS1', 'pro-GDF-8': 'MSTN', 'Rabies Virus Spike Glycoprotein G': '', 'Rabies Virus Strain ERA GP Ectodomain Epitope G-II': '', 'Rabies Virus Strain ERA GP Ectodomain Epitope G-III': '', 'Rabies Virus Surface Glycoprotein 4 (gp4) Epitope 1': '', 'RhD': 'RHD', 'RSV gpF': '', 'RSV gpF;RSV gpF': '', 'RV Antigenic Site III': '', 'SARS-CoV-2 Spike': '', 'SARS-CoV-2 Spike RBD': '', 'Serotype IATS O11': '', 'Shiga Toxin Type 1': '', 'Shiga Toxin Type 2': '', 'SIRPα': 'SIRPA', 'TGFB1 (Canine)': '', 'Toxin A': '', 'VP2 (Canine)': '', 'Zaire Ebolavirus GP': '', 'Zaire Ebolavirus GP1': '', 'α4β7': 'ITGB7', 'RSV': '', 'SpA': ''} + + sabdab = pd.read_csv(dwn_dir+sabdab_dwn, low_memory=False) + sabdab.fillna('', inplace=True) + + for index, row in sabdab.iterrows(): + if row['Target'] 
in sabdab_targets.keys(): + sabdab.loc[index, 'Target'] = sabdab_targets[row['Target']] + + sabdab = sabdab.rename(columns={'Therapeutic': 'drug_name', 'Target': 'gene_name'}) + sabdab = sabdab.drop(sabdab.loc[sabdab['gene_name'] == ''].index) + sabdab.to_csv(pro_dir+sabdab_file, index=False, sep='\t', header=True) + +def process_moalmanac(): + + print('Formating MOAlmanac file...') + + moalmanac_file = 'moalmanac.tsv' + moalmanac_dwn = 'moalmanac_dwn.tsv' + + def process_alterations(selection): + alterations = [] + for ele in list(set(selection['feature_type'].tolist())): + alterations.append(ele+' ('+','.join(list(set(selection[selection['feature_type'] == ele]['protein_change'].dropna().tolist())))+')') + return('; '.join(alterations).replace(' ()', '')) + + inputf = pd.read_csv(dwn_dir+moalmanac_dwn, sep ='\t', header=None, low_memory=False) + inputf.columns = ['assertion_id', 'feature_type', 'gene', 'gene1', 'gene2', 'drug', 'resistance', 'sensitivity', 'therapy_type', 'protein_change', 'predictive_implication', 'validated'] + inputf = inputf[inputf['validated'] == True] + inputf = inputf[((inputf['drug'].str.contains('\+') == False) & (inputf['predictive_implication'].isin(['FDA-Approved', 'Guideline', 'Clinical trial', 'Clinical evidence']))) | ((inputf['drug'].str.contains('\+')) & (inputf['predictive_implication'].isin(['FDA-Approved', 'Guideline'])))] + + subset_add = pd.DataFrame() + for i in ('gene', 'gene1', 'gene2'): + subset_add1 = inputf[[i,'drug']] + subset_add1.columns = ['gene', 'drug'] + subset_add = subset_add.append(subset_add1[~subset_add1.isnull().any(axis=1)]) + + subset_add = subset_add.drop_duplicates() + + moalmanac = pd.DataFrame(columns=['gene_name','drug_name','source','response','alteration','therapy_type']) + source = 'MOAlmanac' + for index, row in subset_add.iterrows(): + selection = inputf[((inputf['gene'] == row['gene']) | (inputf['gene1'] == row['gene']) | (inputf['gene2'] == row['gene'])) & (inputf['drug'] == row['drug'])] + + if len(selection) > 0: + drug = row['drug'] + if re.search('\+', row['drug']): + drug = ' + '.join(sorted(drug.split(' + '))) + + response = [] + if True in list(set(selection['sensitivity'].tolist())): + response.append('sensitivity') + alteration_sen = process_alterations(selection[selection['sensitivity'] == True]) + + if True in list(set(selection['resistance'].tolist())): + response.append('resistance') + alteration_res = process_alterations(selection[selection['resistance'] == True]) + + if len(response) > 0: + if len(response) == 2: + alteration = alteration_sen+' / '+alteration_res + elif response[0] == 'sensitivity': + alteration = alteration_sen + elif response[0] == 'resistance': + alteration = alteration_res + + therapy = list(set(selection['therapy_type'].tolist()))[0] + moalmanac = moalmanac.append({'gene_name' : row['gene'], 'drug_name' : drug, 'source' : source, 'response' : ' / '.join(response), 'alteration' : alteration, 'therapy_type' : therapy} , ignore_index=True) + + moalmanac.to_csv(pro_dir+moalmanac_file, index=False, sep='\t', header=True) + +def process_GDSC(): + + print('Formating GDSC...') + + GDSC_int = 'GDSC_intermediate.tsv' + GDSC = 'GDSC.tsv' + GDSC_dwn = 'GDSC_features.csv' + GDSC_tsv = 'GDSC_features.tsv' + + #Obtain the name of the ANOVA downloaded file from GDSC + ANOVA_dwn = '' + listdir = os.listdir(dwn_dir) + for efile in listdir: + if re.search('^ANOVA', efile) != None: ANOVA_dwn = efile + + filei = open(dwn_dir+GDSC_dwn,'r') + fileo = open(pro_dir+GDSC_tsv,'w') + + for line in filei: + 
line = line.strip('\n')
+        line_a = line.split(',')
+        if line_a[0] == 'cell_line_name':
+            fileo.write('\t'.join(line_a)+'\n')
+        else:
+            fileo.write('\t'.join(line_a[0:8])+'\t'+','.join(line_a[8:])+'\n')
+
+    filei.close()
+    fileo.close()
+
+    def label_sens(row):
+        if row['feature_delta_mean_ic50'] < 0: return 'sensitivity'
+        else: return 'resistance'
+
+    def obtain_features(row):
+        if re.search('_mut',row['feature_name']) != None:
+            return row['feature_name'].replace('_mut','').replace('.',',')
+        elif re.search('loss.|gain.',row['feature_name']) != None and re.search('\.\.',row['feature_name']) != None:
+            field_a = row['feature_name'].split('.')
+
+            if re.search('loss|gain',field_a[3]) != None:
+                pan_list = []
+                for ele in row['feature_name'].split('.'):
+                    if re.search('cnaPANCAN',ele): pan_list.append(ele)
+                field_a = list(set([item for sublist in [y.split(',') for y in [str(x) for x in gfeatures[(gfeatures['genetic_feature'].isin(pan_list)) & (gfeatures['is_mutated'] == 1)]['genes_in_segment']]] for item in sublist]))
+                return ','.join(field_a)
+            else:
+                #Correction of NKX2.1 value
+                if row['feature_name'] == "gain.cnaPANCAN393..FOXA1.NKX2.1.PSMA6.":
+                    field_a[field_a.index('1')-1:field_a.index('1')+1] = ['-'.join(field_a[field_a.index('1')-1:field_a.index('1')+1])]
+                return ','.join(field_a[3:len(field_a)-1])
+
+        elif re.search('loss.|gain.',row['feature_name']) != None and re.search('\.\.',row['feature_name']) == None:
+            field_a = list(set([item for sublist in [y.split(',') for y in [str(x) for x in gfeatures[(gfeatures['genetic_feature'] == row['feature_name'].split('.')[1]) & (gfeatures['is_mutated'] == 1)]['genes_in_segment']]] for item in sublist]))
+
+            while 'UNCLASSIFIED' in field_a: field_a.remove('UNCLASSIFIED')
+            return ','.join(field_a)
+
+    def obtain_alt(row):
+        if re.search('loss.',row['feature_name']) != None: return 'deletion'
+        elif re.search('gain.',row['feature_name']) != None: return 'amplification'
+        elif re.search('_mut',row['feature_name']) != None: return 'mutation'
+
+    gfeatures = pd.read_csv(pro_dir+'GDSC_features.tsv', sep='\t', header = 0, dtype={'cell_line_name': str, 'cosmic_sample_id': int, 'gdsc_desc1': str, 'gdsc_desc2': str, 'tcga_desc': str, 'genetic_feature': str, 'is_mutated': int, 'recurrent_gain_loss': str, 'genes_in_segment': str})
+
+    filei = pd.read_excel(dwn_dir+ANOVA_dwn, sheet_name = 'PANCANCER_ANOVA', header = 0, dtype={'drug_name': str, 'drug_id': int, 'target': str, 'target_pathway': str, 'feature_name': str, 'n_feature_pos': int, 'n_feature_neg': int, 'ic50_effect_size': float, 'log_ic50_mean_pos': float, 'log_ic50_mean_neg': float, 'log_max_conc_tested': float, 'feature_ic50_t_pval': float, 'feature_delta_mean_ic50': float, 'feature_pos_ic50_var': float, 'feature_neg_ic50_var': float, 'feature_pval': float, 'tissue_pval': float, 'msi_pval': float, 'fdr': float, 'tissue_type': str, 'dataset_version': float})
+
+    filei = filei[(filei['feature_pval'] < 0.001) & (filei['fdr'] < 25)]
+
+    filei['response'] = filei.apply(lambda row: label_sens(row), axis = 1)
+    filei['gene'] = filei.apply(lambda row: obtain_features(row), axis = 1)
+    filei['alteration'] = filei.apply(lambda row: obtain_alt(row), axis = 1)
+    filei['source'] = 'GDSC'
+
+    filei['gene'] = filei['gene'].apply(lambda x: x.split(','))
+    filei = filei.explode('gene')
+    filei = filei.reset_index()
+
+    filei.rename(columns={'gene': 'gene_name'}, inplace=True)
+
+    filei.to_csv(pro_dir+GDSC_int, sep='\t', index=False, header=True)
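+
+    #Collapse the per-feature association rows into a single row per
+    #(drug, gene) pair, recording whether the evidence supports sensitivity,
+    #resistance or both, and joining the corresponding alterations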
+    fileo = pd.DataFrame(columns=['gene_name','drug_name','source','response','alteration'])
+    source = 'GDSC'
+
+    for index, row in filei[['drug_name', 'gene_name']].drop_duplicates().iterrows():
+        selection = filei[(filei['drug_name'] == row['drug_name']) & (filei['gene_name'] == row['gene_name'])]
+
+        if len(selection) > 0:
+
+            response = []
+            (alteration_sen, alteration_res) = ('', '')
+
+            if len(selection.loc[selection['response'] == 'sensitivity']) > 0:
+                response.append('sensitivity')
+                alteration_sen = '; '.join(list(set(selection.loc[selection['response'] == 'sensitivity']['alteration'].tolist())))
+            if len(selection.loc[selection['response'] == 'resistance']) > 0:
+                response.append('resistance')
+                alteration_res = '; '.join(list(set(selection.loc[selection['response'] == 'resistance']['alteration'].tolist())))
+
+            if len(response) > 0:
+                if len(response) == 2:
+                    alteration = alteration_sen+' / '+alteration_res
+                elif response[0] == 'sensitivity':
+                    alteration = alteration_sen
+                elif response[0] == 'resistance':
+                    alteration = alteration_res
+
+                fileo = fileo.append({'gene_name' : row['gene_name'], 'drug_name' : row['drug_name'], 'source' : source, 'response' : ' / '.join(response), 'alteration' : alteration} , ignore_index=True)
+
+    fileo.to_csv(pro_dir+GDSC, index=False, sep='\t', header=True)
+
+def process_KEGG_ATC():
+
+    print('Formatting KEGG ATC...')
+
+    KEGG_ATC = 'TargetBasedClassificationKEGG_formated.tsv'
+    KEGG_ATC_dwn = 'br08310.keg'
+
+    filei = open(dwn_dir+KEGG_ATC_dwn, 'r')
+    fileo = open(pro_dir+KEGG_ATC, 'w')
+
+    fileo.write('\t'.join(['drug', 'family1', 'family2', 'family3', 'family4'])+'\n')
+
+    elementA = ''
+    elementB = ''
+    elementC = ''
+    elementD = ''
+
+    for line in filei:
+        line = line.rstrip()
+        line = line.replace('<b>', ' ')
+        line = line.replace('</b>', ' ')
+        line_a = line.split('\t')
+
+        if re.search('^[ABCDEF] *D[\d]{5}', line_a[0]):
+            pattern = re.compile('^[ABCDEF] *D[\d]{5} +(.+?) \(')
+            name_s = pattern.search(line_a[0])
+            if name_s == None:
+                pattern = re.compile('^[ABCDEF] *D[\d]{5} +(.+?)$')
+                name_s = pattern.search(line_a[0])
+            name = name_s.group(1)
+            tipo = ''
+            if len(line_a) > 1:
+                elementA_w = elementA + '_'+line_a[1]
+                elementB_w = elementB + '_'+line_a[1]
+                elementC_w = elementC + '_'+line_a[1]
+                elementD_w = elementD + '_'+line_a[1]
+            else:
+                (elementA_w, elementB_w, elementC_w, elementD_w) = (elementA, elementB, elementC, elementD)
+
+            if line_a[0][0] == 'C': elementB_w = ''
+            if line_a[0][0] == 'D': elementC_w = ''
+            if line_a[0][0] == 'E': elementD_w = ''
+
+
+            fileo.write('\t'.join([name,elementA_w,elementB_w,elementC_w,elementD_w])+'\n')
+        else:
+            if re.search('^A', line_a[0]):
+
+                pattern = re.compile('^A (.+)')
+                elementA_s = pattern.search(line_a[0])
+                elementA = elementA_s.group(1)
+                elementB = ''
+                elementC = ''
+                elementD = ''
+            elif re.search('^B', line_a[0]):
+
+                pattern = re.compile('^B (.+) \[')
+                elementB_s = pattern.search(line_a[0])
+                if elementB_s == None:
+                    pattern = re.compile('^B (.+)$')
+                    elementB_s = pattern.search(line_a[0])
+                elementB = elementB_s.group(1)
+                elementC = ''
+                elementD = ''
+            elif re.search('^C', line_a[0]):
+
+                pattern = re.compile('^C (.+?) \[')
+                elementC_s = pattern.search(line_a[0])
+                if elementC_s == None:
+                    pattern = re.compile('^C (.+?)$')
+                    elementC_s = pattern.search(line_a[0])
+                elementC = elementC_s.group(1)
+                elementD = ''
+            elif re.search('^D', line_a[0]):
+
+                pattern = re.compile('^D (.+?) 
\[') + elementD_s = pattern.search(line_a[0]) + if elementD_s == None: + pattern = re.compile('^D (.+?)$') + elementD_s = pattern.search(line_a[0]) + elementD = elementD_s.group(1) + + filei.close() + fileo.close() + +def process_cmap(): + + print('Formating CMAP MOA...') + + CMAP_MOA = 'cmap_moa.tsv' + + fileo = open(pro_dir+CMAP_MOA ,'w') + fileo.write('\t'.join(['drug', 'moa'])+'\n') + + with open(dwn_dir+CMAP_MOA) as f: + for line in f: + data = json.loads(line) + for e in data: + fileo.write('\t'.join([e['pert_iname'], e['name']])+'\n') + fileo.close() + +def process_FDA(): + + print('Formating FDA file...') + + fda_status_file = 'fda_status.tsv' + #Obtain the name of the FDA zip downloaded + fda_dwn = '' + listdir = os.listdir(dwn_dir) + for efile in listdir: + if re.search('^drugsatfda', efile) != None: fda_dwn = efile + + df_dict = dict() + with zipfile.ZipFile(dwn_dir+fda_dwn) as z: + for filename in z.namelist(): + if filename == 'MarketingStatus.txt': + # read the file + with z.open(filename) as f: + data = pd.read_csv(f, sep ='\t', header=0, low_memory=False) + df_dict['markstat'] = data + elif filename == 'Products.txt': + with z.open(filename) as f: + content = f.read() + content = str(content, 'utf-8').replace('\t\r\n', '\r\n') + data = pd.read_csv(StringIO(content), sep ='\t', header=0, low_memory=False) + df_dict['products'] = data + elif filename == 'MarketingStatus_Lookup.txt': + with z.open(filename) as f: + data = pd.read_csv(f, sep ='\t', header=0, low_memory=False) + df_dict['markstatlook'] = data + + #create the output file + fda_drug_list = {} + for index, row in df_dict['products'].iterrows(): + compound = row['ActiveIngredient'].split('; ') + applno = row['ApplNo'] + for comp in compound: + status = df_dict['markstatlook'][df_dict['markstatlook']['MarketingStatusID'] == df_dict['markstat'][df_dict['markstat']['ApplNo'] == applno]['MarketingStatusID'].tolist()[0]]['MarketingStatusDescription'].tolist() + if comp in fda_drug_list.keys(): + fda_drug_list[comp][0] = fda_drug_list[comp][0]+status + fda_drug_list[comp][1] = fda_drug_list[comp][1]+[applno] + else: + fda_drug_list[comp] = [status, [applno]] + + for comp in fda_drug_list: + if 'Prescription' in fda_drug_list[comp][0] or 'Over-the-counter' in fda_drug_list[comp][0]: fda_drug_list[comp][0] = 'Approved' + elif 'Discontinued' in fda_drug_list[comp][0]: fda_drug_list[comp][0] = 'Withdrawn' + else: fda_drug_list[comp][0] = '' + fda_drug_list[comp][1] = ';'.join([str(x) for x in list(set(fda_drug_list[comp][1]))]) + + fda_status = pd.DataFrame(list(fda_drug_list.items()), columns=['drug','data']) + fda_status = pd.concat([fda_status, pd.DataFrame(fda_status['data'].to_list(), columns = ['status', 'ApplNo'])], axis=1) + fda_status = fda_status.drop('data', axis=1) + fda_status.to_csv(pro_dir+fda_status_file, index=False, sep='\t') + +def process_FDA_label(): + + print('Formating FDA label file...') + + FDA_label = 'fda_labels.tsv' + + fileo = open(pro_dir+FDA_label,'w') + fileo.write("\t".join(['indication', 'ApplNo'])+'\n') + + #Obtain the name of the FDA zip downloaded + listdir = os.listdir(dwn_dir) + + for efile in listdir: + + if re.search('^drug\-label', efile) != None: + + with zipfile.ZipFile(dwn_dir+efile) as z: + for filename in z.namelist(): + # read the file + with z.open(filename) as f: + data = json.load(f) + for e in data['results']: + if 'indications_and_usage' in e.keys() and 'application_number' in e['openfda'].keys(): + fileo.write("\t".join([e['indications_and_usage'][0], 
e['openfda']['application_number'][0]])+'\n') + + fileo.close() + +def process_EMA(): + + print('Formating EMA file...') + + ema_file = 'ema_status.tsv' + EMA_dwn = 'Medicines_output_european_public_assessment_reports.xlsx' + + filei = pd.read_excel(dwn_dir+EMA_dwn) + filei.columns = filei.loc[filei.iloc[:,0] == 'Category'].values.tolist()[0] + selection = filei.loc[filei['Category'] == 'Human'][['Medicine name', 'Therapeutic area', 'International non-proprietary name (INN) / common name', 'Active substance', 'Authorisation status', 'Condition / indication']] + + selection.to_csv(pro_dir+ema_file, index=False, sep='\t') + +def process_ct(): + + print('Formating Clinical Trial file...') + + ct = 'clinicaltrials.tsv' + ct_dwn = 'AllPublicXML.zip' + + fileo = open(pro_dir+ct,'w') + fileo.write("\t".join(['condition', 'title', 'status', 'drug'])+'\n') + + with zipfile.ZipFile(dwn_dir+ct_dwn) as z: + for i in progressbar.progressbar(range(len(z.infolist()))): + if re.search('.xml$', z.infolist()[i].filename): + with z.open(z.infolist()[i].filename) as f: + [title,status,condition,drug] = ['','','',''] + tree = ET.ElementTree(file=f) + for elem in tree.iterfind('official_title'): + title = elem.text + for elem in tree.iterfind('overall_status'): + status = elem.text + for elem in tree.iterfind('condition'): + condition = elem.text + for elem in tree.iterfind('intervention/intervention_type'): + if elem.text == 'Drug': + for elem in tree.iterfind('intervention/intervention_name'): + drug = elem.text + + if drug != '': fileo.write('\t'.join([condition,title,status,drug])+'\n') + + fileo.close() + +def process_cgc_oncovar(): + + print('Creating file oncovar scores...') + + oncovar_file = 'oncovar_scores.tsv' + oncovar_dwn = 'TCGA.PanCancer.all.genes.OncoVar.tsv.gz' + + oncovar_tumor_dir = 'All_genes_OncoVar_TCGA.tar.gz' + oncovar = '' + + tar = tarfile.open(name=dwn_dir+oncovar_tumor_dir) + + output = open(pro_dir+oncovar_file, 'w') + output.write('\t'.join(['gene', 'cancer_type', 'P_value', 'FDR', 'OncoVar_Score', 'Consensus_Score', 'Driver_Level'])+'\n') + + for member in tar.getnames(): + if re.search('.gz$', member): + f=tar.extractfile(member) + data = pd.read_csv(f, compression='gzip', sep='\t', low_memory=False, header=0, encoding = 'ISO-8859-1') + for index, row in data.iterrows(): + if row['Cancer'] == 'PanCancer': + cancer = row['Cancer'] + else: + cancer = row['Cancer'].split('[')[1] + cancer = cancer.replace(']', '') + output.write('\t'.join([row['Gene_symbol'], cancer, str(row['P_value']), str(row['FDR']), str(row['OncoVar_Score']), str(row['Consensus_Score']), str(row['Driver Level'])])+'\n') + if re.search(oncovar_dwn, member): + oncovar = data + + print('Creating file with ONC and TSG classification...') + + cgc_file = 'cancer_gene_census.csv' + cgc = pd.read_csv(dwn_dir+cgc_file, low_memory=False, header=0, encoding = 'ISO-8859-1') + cgc = cgc.fillna('') + + fileo = open(pro_dir+'generole.tsv','w') + fileo.write("\t".join(['gene', 'CGC', 'OncoKB', '2020Rule', 'CTAT', 'Oncogene', 'TSgene']) + '\n') + + genes = list(set(cgc['Gene Symbol'].tolist() + oncovar['Gene_symbol'].tolist())) + + for gene in genes: + + categories = [cgc.loc[cgc['Gene Symbol'] == gene]['Role in Cancer'], oncovar.loc[oncovar['Gene_symbol'] == gene]['OncoKB'], oncovar.loc[oncovar['Gene_symbol'] == gene]['2020Rule'], oncovar.loc[oncovar['Gene_symbol'] == gene]['CTAT'], oncovar.loc[oncovar['Gene_symbol'] == gene]['Oncogene'], oncovar.loc[oncovar['Gene_symbol'] == gene]['TSgene']] + + for idx, cat in 
enumerate(categories):
+            list_cat = []
+            if not cat.empty:
+                value = cat.tolist()[0]
+                if isinstance(value, str):
+                    subcats = value.split(', ')
+                    subcats = [x.split('/') for x in subcats]
+                    subcats = [item for sublist in subcats for item in sublist]
+
+                    for subcat in subcats:
+                        if subcat not in list_cat: list_cat.append(subcat.upper())
+
+                    if idx == 4:
+                        if list_cat[0] == 'Y': list_cat[0] = 'ONC'
+                    elif idx == 5:
+                        if list_cat[0] == 'Y': list_cat[0] = 'TSG'
+
+                    list_cat = ['ONC' if x in ['FUSION', 'ONCOGENE'] else x for x in list_cat]
+                    list_cat = ['' if x in ['N', 'Y'] else x for x in list_cat]
+
+            categories[idx] = '/'.join(list(set(list_cat)))
+
+        fileo.write("\t".join([gene]+categories) + '\n')
+
+    fileo.close()
+
+    print('Creating file with CGC info for scores...')
+
+    #use a separate variable for the output filename so the 'cgc' dataframe
+    #loaded above is not shadowed
+    cgc_scores_file = 'cgc_scores.tsv'
+
+    fileo = open(pro_dir+cgc_scores_file,'w')
+    fileo.write("\t".join(['gene', 'cancer_type_source', 'Tier']) + '\n')
+
+    for index, row in cgc.iterrows():
+        tumors = row['Tumour Types(Somatic)'].split(', ')+row['Tumour Types(Germline)'].split(', ')
+        tumors = [x for x in tumors if x != '']
+        for t in list(set(tumors)):
+            fileo.write("\t".join([row['Gene Symbol'], t, str(row['Tier'])]) + '\n')
+
+    fileo.close()
+
+def process_KEGG_ind():
+
+    print('Creating pathway member information...')
+
+    paths = ['hsa03320', 'hsa04010', 'hsa04012', 'hsa04014', 'hsa04015', 'hsa04020', 'hsa04022', 'hsa04024', 'hsa04062', 'hsa04064', 'hsa04066', 'hsa04068', 'hsa04071', 'hsa04110', 'hsa04114', 'hsa04115', 'hsa04150', 'hsa04151', 'hsa04152', 'hsa04210', 'hsa04261', 'hsa04270', 'hsa04310', 'hsa04330', 'hsa04340', 'hsa04350', 'hsa04370', 'hsa04390', 'hsa04510', 'hsa04520', 'hsa04530', 'hsa04540', 'hsa04611', 'hsa04620', 'hsa04621', 'hsa04622', 'hsa04630', 'hsa04650', 'hsa04660', 'hsa04662', 'hsa04664', 'hsa04666', 'hsa04668', 'hsa04670', 'hsa04722', 'hsa04910', 'hsa04912', 'hsa04914', 'hsa04915', 'hsa04916', 'hsa04919', 'hsa04920', 'hsa04921', 'hsa04922', 'hsa04971', 'hsa05010', 'hsa05012', 'hsa05160', 'hsa05200', 'hsa05205', 'hsa05212', 'hsa05214', 'hsa05218', 'hsa05231']
+
+    global dwn_dir
+    global pro_dir
+
+    dwn_dir_KEGG = dwn_dir+'KEGGmodeled/'
+    pro_dir_KEGG = pro_dir+'KEGGmodeled'
+
+    if not os.path.exists(pro_dir_KEGG): os.makedirs(pro_dir_KEGG)
+
+    if not os.path.exists(pro_dir_KEGG+'/'+'components'): os.makedirs(pro_dir_KEGG+'/'+'components')
+    if not os.path.exists(pro_dir_KEGG+'/'+'relations'): os.makedirs(pro_dir_KEGG+'/'+'relations')
+    if not os.path.exists(pro_dir_KEGG+'/'+'subpaths'): os.makedirs(pro_dir_KEGG+'/'+'subpaths')
+    if not os.path.exists(pro_dir_KEGG+'/'+'dictionaries'): os.makedirs(pro_dir_KEGG+'/'+'dictionaries')
+
+    pro_dir_KEGG = pro_dir_KEGG+'/'
+
+    gene_symbol_dic = {}
+
+    def retrieve_symbol(gene):
+
+        if gene not in gene_symbol_dic.keys():
+
+            import sys
+            from Bio import Entrez
+
+            Entrez.email = "" #place email
+
+            request = Entrez.epost("gene", id=gene)
+
+            result = Entrez.read(request)
+
+            webEnv = result["WebEnv"]
+            queryKey = result["QueryKey"]
+            data = Entrez.esummary(db="gene", webenv=webEnv, query_key=queryKey)
+            annotations = Entrez.read(data)
+
+            gene_symbol = annotations["DocumentSummarySet"]["DocumentSummary"][0]["NomenclatureSymbol"]
+
+            gene_symbol_dic[gene] = gene_symbol
+
+        else: gene_symbol = gene_symbol_dic[gene]
+
+        return gene_symbol
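+
+    #Parse each KGML pathway file into two TSVs: 'components' maps each entry
+    #node to its member ids (genes, compounds, drugs, glycans) and 'relations'
+    #lists the pairwise links between nodes, expanding group entries so that
+    #every member inherits the group's relations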
+
+    def parse_kegg_xml():
+
+        for path in paths:
+
+            output_components = open(pro_dir_KEGG+'components/'+path+'.components.tsv','w')
+            output_relations = open(pro_dir_KEGG+'relations/'+path+'.relations.tsv','w')
+
+            tree = ET.parse(dwn_dir_KEGG+path+'.kgml.xml')
+            root = tree.getroot()
+
+            group = {}
+            for entry in root.iter('entry'):
+                if entry.attrib['type'] not in ['ortholog', 'map']:
+                    ID = entry.attrib['id']
+                    if entry.attrib['type'] == 'group':
+                        group[ID] = []
+                        for component in entry.iter('component'):
+                            group[ID].append(component.attrib['id'])
+                    else:
+                        components = entry.attrib['name'].replace('hsa:','')
+                        components = components.replace('cpd:','')
+                        components = components.replace('dr:','')
+                        components = components.replace('gl:','').split(' ')
+                        output_components.write('\t'.join([ID,':'.join(components)])+'\n')
+
+            for relation in root.iter('relation'):
+                entry1 = [relation.attrib['entry1']]
+                entry2 = [relation.attrib['entry2']]
+                TYPE = relation.attrib['type']
+
+                #if an ID belongs to a group, the relation is repeated for each member, pairing every component of one side with every component of the other
+                if entry1[0] in group.keys():
+                    entry1 = group[entry1[0]]
+
+                if entry2[0] in group.keys():
+                    entry2 = group[entry2[0]]
+
+                r_product = list(itertools.product(entry1, entry2))
+
+                (name, value) = ([], [])
+                for subtype in relation.iter('subtype'):
+                    name.append(subtype.attrib['name'])
+                    value.append(subtype.attrib['value'])
+
+                for r in r_product:
+                    output_relations.write('\t'.join([r[0],r[1],TYPE,','.join(name),','.join(value)])+'\n')
+
+            output_components.close()
+            output_relations.close()
+
+    def nodes_ids_and_symbols():
+        for i in progressbar.progressbar(range(len(paths))):
+            relations = pd.read_csv(pro_dir_KEGG+'relations/'+paths[i]+'.relations.tsv', sep='\t', low_memory=False, header=None)
+            relations.columns = ['entry1', 'entry2', 'type', 'name', 'value']
+            components = pd.read_csv(pro_dir_KEGG+'components/'+paths[i]+'.components.tsv', sep='\t', low_memory=False, header=None)
+            components.columns = ['ID', 'components']
+
+            fileo = open(pro_dir_KEGG+'dictionaries/'+paths[i]+'.codes.tsv', 'w')
+            fileo.write('\t'.join(['node', 'id', 'symbol'])+'\n')
+
+            nodes = list(set(relations['entry1'].tolist() + relations['entry2'].tolist()))
+
+            for node in nodes:
+                components_id = components.loc[components['ID'] == node]['components'].tolist()[0]
+                if re.search('\.', str(components_id)):
+                    #group node: translate each member node ID into its own components
+                    ids = str(components_id).split('.')
+                    for j, t in enumerate(ids):
+                        ids[j] = components.loc[components['ID'] == int(t)]['components'].tolist()[0]
+                else:
+                    ids = components.loc[components['ID'] == node]['components'].tolist()
+
+                ids = [x for ele in ids for x in str(ele).split(':')]
+
+                symbols = []
+                for el in ids:
+                    #skip compound (C) and glycan (G) identifiers, which are not genes
+                    if re.search('(C|G)', el) == None:
+                        symbol = retrieve_symbol(el)
+                        if symbol != '': symbols.append(symbol)
+
+                fileo.write('\t'.join([str(node), ':'.join(ids),':'.join(symbols)])+'\n')
+
+            fileo.close()
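+
+    # Each row of dictionaries/<path>.codes.tsv maps a KGML node to its member
+    # gene IDs and the resolved symbols, e.g. (illustrative):
+    #   23<TAB>5594:5595<TAB>MAPK1:MAPK3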
+
+    #translates node-ID relations into gene-symbol relations (not invoked by the current pipeline)
+    def translate_codes_to_symbols():
+        for i in progressbar.progressbar(range(len(paths))):
+            relations = pd.read_csv(pro_dir_KEGG+'relations/'+paths[i]+'.elemcode.tsv', sep='\t', low_memory=False, header=None)
+            relations.columns = ['entry1', 'entry2', 'type', 'name', 'value']
+            fileo = open(pro_dir_KEGG+'relations/'+paths[i]+'.symbol.tsv', 'w')
+
+            for index, row in relations.iterrows():
+                entries = [row['entry1'], row['entry2']]
+                for idx, entry in enumerate(entries):
+                    transl = []
+                    for el in entry.split(':'):
+                        if re.search('(C|G)', el) == None:
+                            symbol = retrieve_symbol(el)
+                            transl.append(symbol)
+                        else:
+                            transl.append('')
+
+                    entries[idx] = ':'.join(transl)
+
+                fileo.write('\t'.join(entries+[str(x) for x in row[2:].tolist()])+'\n')
+
+            fileo.close()
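+
+    # obtain_subroutes() enumerates candidate regulatory chains over each pathway
+    # graph: nx.all_simple_paths with cutoff=3 yields node paths of at most four
+    # nodes, and every consecutive node pair is then validated against the KGML
+    # relations. A pair survives only if a direct (non-'indirect effect') relation
+    # exists and the edge type is compatible with the source gene's role: for
+    # example, a gene whose only role is ONC is dropped when the edge into its
+    # target carries only 'inhibition'/'repression', while a TSG-only gene is
+    # dropped under 'activation'/'expression'.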
+
+    def obtain_subroutes():
+        import networkx as nx
+        import more_itertools
+
+        for i in progressbar.progressbar(range(len(paths))):
+            fileo = open(pro_dir_KEGG+'subpaths/'+paths[i]+'.symbol.tsv','w')
+            filei = pd.read_csv(pro_dir_KEGG+'relations/'+paths[i]+'.relations.tsv', sep='\t', low_memory=False, header=None)
+            filei.columns = ['entry1', 'entry2', 'type', 'name', 'value']
+            dictionary = pd.read_csv(pro_dir_KEGG+'dictionaries/'+paths[i]+'.codes.tsv', sep='\t', low_memory=False, header=0)
+            generole = pd.read_csv(pro_dir+'generole.tsv', sep='\t', low_memory=False, header=0)
+
+            entry_list = [list(a) for a in zip(filei.entry1, filei.entry2)]
+
+            # Create graph
+            G = nx.Graph()
+            # Fill graph with data
+            G.add_edges_from(entry_list)
+
+            #obtain all node-to-node paths of at most 4 components
+            all_paths = []
+            for start in G.nodes:
+                for end in G.nodes:
+                    if start != end:
+                        all_paths = all_paths + list(nx.all_simple_paths(G, start, end, 3))
+
+            #remove duplicates
+            all_paths.sort()
+            all_paths_dedup = [list(a) for a in (all_paths for all_paths,_ in itertools.groupby(all_paths))]
+
+            #discard elements after a non-valid association; discarded valid elements will appear at the beginning of another path retrieved by all_simple_paths,
+            #which also removes the nodes linked in the backward direction by all_simple_paths
+            all_paths_checked = []
+            for path in all_paths_dedup:
+                path_it = []
+                for pair in list(more_itertools.windowed(path, n=2, step=1)):
+                    associations = [item for sublist in [x.split(',') for x in filei.loc[(filei['entry1'] == pair[0]) & (filei['entry2'] == pair[1])]['name'].tolist()] for item in sublist]
+
+                    if len(associations) > 0: #valid: association is in the relations file
+                        if len(set(associations) & set(['indirect effect'])) == 0: #valid: association is not indirect
+                            if str(dictionary.loc[dictionary['node'] == pair[0]]['symbol'].tolist()[0]) != 'nan' and str(dictionary.loc[dictionary['node'] == pair[1]]['symbol'].tolist()[0]) != 'nan': #valid: both nodes have a gene symbol attached
+                                #update the source node
+                                genes = []
+                                for gene in dictionary.loc[dictionary['node'] == pair[0]]['symbol'].tolist()[0].split(':'):
+                                    #decide the role of the gene
+                                    roles = ':'.join(list(set([x for x in generole.loc[generole['gene'] == gene].values.flatten().tolist()[1:] if str(x) != 'nan'])))
+                                    #discard the gene when the pair carries an association that is restrictive for its role
+                                    if (roles in ['ONC', ''] and len(set(associations) & set(['activation', 'expression'])) > 0) or (roles == 'TSG' and len(set(associations) & set(['inhibition', 'repression'])) > 0) or (roles not in ['ONC', '', 'TSG']) or (len(set(['activation', 'expression', 'inhibition', 'repression']) & set(associations)) == 0):
+                                        genes.append(gene)
+
+                                gene_names = ':'.join(genes)
+                                if gene_names != '': #the association is only saved if the node keeps genes
+                                    if len(path_it) == 0: path_it = [gene_names]
+                                    else: path_it[-1] = gene_names
+
+                                    #report the destination node
+                                    genes = []
+                                    for gene in dictionary.loc[dictionary['node'] == pair[1]]['symbol'].tolist()[0].split(':'):
+                                        genes.append(gene)
+                                    gene_names = ':'.join(genes)
+                                    path_it.append(gene_names)
+
+                            else:
+                                break
+                        else:
+                            break
+                    else:
+                        break
+
+                all_paths_checked.append(path_it)
+
+            #remove duplicates
+            all_paths_checked = [x for x in all_paths_checked if len(x) > 1]
+            all_paths_checked.sort()
+            all_paths_checked_dedup = [list(a) for a in (all_paths_checked for all_paths_checked,_ in itertools.groupby(all_paths_checked))]
+
+            #remove subpaths fully contained in longer ones
+            all_paths_unified = []
+            for path in all_paths_checked_dedup:
+                found = False
+                for path_search in all_paths_checked_dedup:
+                    #when both paths in a match have the same length, the hit is the path itself, because duplicates have already been removed
+                    if any(path == path_search[i:i+len(path)] for i in range(len(path_search))) and len(path) != len(path_search):
+                        found = True
+                        break
+                if not found:
+                    all_paths_unified.append(path)
+
+            for path in all_paths_unified:
+                fileo.write('\t'.join([str(x) for x in path])+'\n')
+            fileo.close()
+
+    def obtain_upstream_genes():
+        upstream_genes = {}
+        for i in progressbar.progressbar(range(len(paths))):
+            filei = open(pro_dir_KEGG+'subpaths/'+paths[i]+'.symbol.tsv', 'r')
+            for line in filei:
+                line = line.rstrip('\n')
+                line_a = line.split('\t')
+                for idx, col in enumerate(line_a[1:]):
+                    for gene in col.split(':'):
+                        if gene in upstream_genes.keys():
+                            upstream_genes[gene] = upstream_genes[gene] + [x for ele in line_a[:idx+1] for x in ele.split(':')]
+                        else:
+                            upstream_genes[gene] = [x for ele in line_a[:idx+1] for x in ele.split(':')]
+            filei.close()
+        fileo = open(pro_dir_KEGG+'upstream_genes.tsv', 'w')
+        fileo.write('\t'.join(['gene', 'upstream_genes'])+'\n')
+        for gene in upstream_genes:
+            if gene != '':
+                upstream_genes_list = list(set(upstream_genes[gene]))
+                upstream_genes_list = list(filter(None, upstream_genes_list))
+                if len(upstream_genes_list) > 0:
+                    fileo.write('\t'.join([gene, '|'.join(upstream_genes_list)])+'\n')
+        fileo.close()
+
+    parse_kegg_xml()
+    nodes_ids_and_symbols()
+    obtain_subroutes()
+    obtain_upstream_genes()
+
+def process_gene_names():
+
+    print('Creating file with checked gene symbols...')
+
+    genes_chk = 'genes_checked.tsv'
+
+    genes = []
+
+    dgidb = 'DGIdb_interactions.tsv'
+    sabdab_file = 'sabdab.tsv'
+    moalmanac_file = 'moalmanac.tsv'
+    GDSC = 'GDSC.tsv'
+    drugbank_file = 'DrugBank.tsv'
+
+    files = [dgidb, sabdab_file, moalmanac_file, GDSC, drugbank_file]
+
+    for f in files:
+        inputf = open(pro_dir+f,'r')
+        for line in inputf:
+            line = line.rstrip('\n')
+            line_a = line.split('\t')
+            if 'gene_name' in line_a:
+                gene_index = line_a.index('gene_name')
+                continue
+            if f == GDSC: genes = genes+line_a[gene_index].split(',')
+            elif f == sabdab_file: genes = genes+line_a[gene_index].split(';')
+            else: genes.append(line_a[gene_index])
+        inputf.close()
+
+    outputf = open(pro_dir+genes_chk, 'w')
+    outputf.write('\t'.join(['gene_name', 'checked_gene_symbol'])+'\n')
+
+    import httplib2 as http
+    import json
+
+    try:
+        from urlparse import urlparse
+    except ImportError:
+        from urllib.parse import urlparse
+
+    headers = {
+        'Accept': 'application/json',
+    }
+
+    uri = 'http://rest.genenames.org'
+    method = 'GET'
+    body = ''
+    h = http.Http()
+
+    genes = list(set(genes))
+
+    for i in progressbar.progressbar(range(len(genes))):
+        found = False
+
+        path = '/search/symbol/'+genes[i]
+
+        target = urlparse(uri+path)
+
+        response, content = h.request(target.geturl(), method, body, headers)
+
+        if response['status'] == '200':
+            # assume that content is a json reply
+            # parse content with the json module
+            data = json.loads(content)
+            if len(data['response']['docs']) > 0:
+                outputf.write('\t'.join([genes[i],
data['response']['docs'][0]['symbol']])+'\n') + found = True + else: + print ('Error detected: ' + response['status']) + + if not found: + path = '/search/alias_symbol/'+genes[i] + target = urlparse(uri+path) + + response, content = h.request(target.geturl(), method, body, headers) + + if response['status'] == '200': + data = json.loads(content) + if len(data['response']['docs']) > 0: + outputf.write('\t'.join([genes[i], data['response']['docs'][0]['symbol']])+'\n') + found = True + else: + print ('Error detected: ' + response['status']) + + if not found: + path = '/search/prev_symbol/'+genes[i] + target = urlparse(uri+path) + + response, content = h.request(target.geturl(), method, body, headers) + + if response['status'] == '200': + data = json.loads(content) + if len(data['response']['docs']) > 0: + outputf.write('\t'.join([genes[i], data['response']['docs'][0]['symbol']])+'\n') + found = True + else: + print ('Error detected: ' + response['status']) + + if not found: + print('Not found:'+genes[i]) + outputf.write('\t'.join([genes[i], genes[i]])+'\n') + + outputf.close() + +def process_civic(): + + print('Processing civic file...') + + civic_file = 'civic.tsv' + civic_dwn = 'civic_evidence.tsv' + + output = open(pro_dir+civic_file, 'w') + output.write('\t'.join(['drug_name', 'gene_name', 'variation', 'response', 'source'])+'\n') + civic = pd.read_csv(dwn_dir+civic_dwn, sep ='\t', low_memory=False) + + civic_select = civic.loc[(civic['status'] == 'ACCEPTED') & (civic['evidence_direction'] == 'SUPPORTS') & (civic['clinical_significance'].isin(['RESISTANCE', 'SENSITIVITYRESPONSE'])) & (civic['evidenceLevel'].isin(['A']))] # A: Validated association + + drug_gene = civic_select.drop_duplicates(subset=['drug', 'gene']) + + for index, row in drug_gene.iterrows(): + alteration_sen = [] + alteration_res = [] + civic_subset = civic_select.loc[(civic_select['drug'] == row['drug']) & (civic_select['gene'] == row['gene'])] + + for index2, row2 in civic_subset.iterrows(): + if row2['clinical_significance'] == 'SENSITIVITYRESPONSE': + alteration_sen.append(row2['variant']) + else: + alteration_res.append(row2['variant']) + + if len(alteration_sen) > 0 and len(alteration_res) > 0: + response = 'sensitivity / resistance' + variation = '; '.join(list(set(alteration_sen)))+' / '+'; '.join(list(set(alteration_res))) + elif len(alteration_sen) > 0: + response = 'sensitivity' + variation = '; '.join(list(set(alteration_sen))) + else: + response = 'resistance' + variation = '; '.join(list(set(alteration_res))) + + + output.write('\t'.join([row['drug'], row['gene'], variation, response, 'CIViC'])+'\n') + output.close() + +def process_oncoKB(): + + print('Processing oncoKB file...') + + oncokb_file = 'oncokb.tsv' + oncokb_dwn = 'oncokb_biomarker_drug_associations.tsv' + + output = open(pro_dir+oncokb_file, 'w') + output.write('\t'.join(['drug_name', 'gene_name', 'variation', 'response', 'source'])+'\n') + oncokb = pd.read_csv(dwn_dir+oncokb_dwn, sep ='\t', low_memory=False) + + oncokb_select = oncokb.loc[(oncokb['Level'].isin(['1', '2', 'R1'])) & (oncokb['Gene'] != 'Other Biomarkers')] # 1, 2 and R1 (top evidence) + oncokb_select = oncokb_select.rename(columns={'Drugs (for therapeutic implications only)':'Drugs'}) + oncokb_select['Drugs'] = oncokb_select['Drugs'].apply(lambda x: x.split(', ')) + oncokb_select = oncokb_select.explode('Drugs') + oncokb_select = oncokb_select.reset_index() + + for index, row in oncokb_select.iterrows(): + d = row['Drugs'] + if re.search('\+', d): + d = ' + '.join(sorted(d.split(' 
+ '))) + oncokb_select.loc[index, ['Drugs']] = d + + drug_gene = oncokb_select.drop_duplicates(subset=['Drugs', 'Gene']) + + for index, row in drug_gene.iterrows(): + alteration_sen = [] + alteration_res = [] + oncokb_subset = oncokb_select.loc[(oncokb_select['Drugs'] == row['Drugs']) & (oncokb_select['Gene'] == row['Gene'])] + + for index2, row2 in oncokb_subset.iterrows(): + if re.search('excluding', row2['Alterations']) != None: + alt = [row2['Alterations']] + else: + alt = row2['Alterations'].split(', ') + + if row2['Level'] in ['1', '2', '3A']: + alteration_sen = alteration_sen + alt + else: + alteration_res = alteration_res + alt + + if len(alteration_sen) > 0 and len(alteration_res) > 0: + response = 'sensitivity / resistance' + variation = '; '.join(sorted(list(set(alteration_sen))))+' / '+'; '.join(sorted(list(set(alteration_res)))) + elif len(alteration_sen) > 0: + response = 'sensitivity' + variation = '; '.join(sorted(list(set(alteration_sen)))) + else: + response = 'resistance' + variation = '; '.join(sorted(list(set(alteration_res)))) + + output.write('\t'.join([row['Drugs'], row['Gene'], variation, response, 'OncoKB'])+'\n') + output.close() + +def process_intogen(): + + print('Processing intogen file...') + + intogen = 'intogen.tsv' + intogen_dwn = 'download' + + output = open(pro_dir+intogen, 'w') + output.write('\t'.join(['gene_name', 'cohort', 'cancer_type', 'qvalue_combination'])+'\n') + + with zipfile.ZipFile(dwn_dir+intogen_dwn) as z: + for i in z.infolist(): + if re.search('Compendium_Cancer_Genes.tsv', i.filename): + with z.open(i.filename) as f: + data = pd.read_csv(f, sep ='\t', header=0, low_memory=False) + for index, row in data.iterrows(): + output.write('\t'.join([row['SYMBOL'], row['COHORT'], row['CANCER_TYPE'], str(row['QVALUE_COMBINATION'])])+'\n') + + output.close() + +def process_depmap(): + + print('Processing DepMap public score + Chronos...') + + depmap_dwn = 'CRISPR_gene_effect.csv' + depmap_pro = 'chronos_skew.tsv' + + matrix = pd.read_csv(dwn_dir+depmap_dwn, low_memory=False) + matrix = matrix.set_index('DepMap_ID') + matrix.columns = [x.split(' ')[0] for x in matrix.columns.tolist()] + skewness = matrix.apply(lambda x : scipy.stats.skew(x, nan_policy='omit')) + skewness_min = pd.DataFrame(skewness) + skewness_min['min'] = skewness[skewness > -0.5].min() + + skewness_min.to_csv(pro_dir+depmap_pro, index=True, sep='\t', header=False) + +def process_KEGG_pathways(): + + print('Processing KEGG gene pathway file...') + + genepathway_file = 'gene_pathway.tsv' + + inputf = pd.read_csv(dwn_dir+genepathway_file, sep ="\t", header=None, low_memory=False) + + inputf.columns =['KEGG Gene ID', 'KEGG Pathway ID'] + inputf['KEGG Gene ID'] = inputf['KEGG Gene ID'].str.replace('hsa:','') + inputf['KEGG Pathway ID'] = inputf['KEGG Pathway ID'].str.replace('path:','') + + array_agg = lambda x: '|'.join(x.astype(str)) + inputf = inputf.groupby('KEGG Gene ID').agg({'KEGG Pathway ID': array_agg}) + + inputf.to_csv(pro_dir+genepathway_file, sep = "\t") + + print('Processing KEGG pathway descriptions file...') + + pathwaydesc_file = 'pathway_desc.tsv' + + inputf = pd.read_csv(dwn_dir+pathwaydesc_file, sep ="\t", header=None, low_memory=False) + + inputf.columns =['KEGG Pathway ID', 'KEGG Pathway desc'] + inputf['KEGG Pathway ID'] = inputf['KEGG Pathway ID'].str.replace('path:','') + inputf['KEGG Pathway desc'] = inputf['KEGG Pathway desc'].str.replace(' \- Homo sapiens \(human\)','') + + inputf.to_csv(pro_dir+pathwaydesc_file, sep = "\t", index=False) + +def process_SL(): 
+ + print('Processing SL dependencies file...') + + sl_file = 'all_genetic_dependencies.tsv' + + inputf = pd.read_csv(dwn_dir+sl_file, sep ="\t", low_memory=False) + outputf = open(pro_dir+'genetic_dependencies.tsv','w') + outputf.write('\t'.join(['gene','genetic_dependency'])+'\n') + + sel = inputf.loc[inputf['padj'] < 0.25].groupby("dependency") + + for dependency, frame in sel: + dep_genes = [] + for index, row in sel.get_group(dependency).iterrows(): + if not row['gene'] == dependency: + dep_genes.append(row['gene']+'('+row['alteration']+')') + if len(dep_genes) > 0: + outputf.write('\t'.join([dependency,'|'.join(dep_genes)])+'\n') + + outputf.close() + +def process_file(filename): + print(filename) + cosmic = {} + gene_freq = {} + mut_freq = {} + + inputf = pd.read_csv(pro_dir+'temp/'+filename, sep ="\t", header=0, low_memory=False) + + if not inputf.empty: + inputf[['Gene name']] = inputf['Gene name'].str.split('_',expand=True)[[0]] + inputf[['HGVSC_Transcript','HGVSC']] = inputf['HGVSC'].str.split(':',expand=True) + inputf[['HGVSC_Transcript']] = inputf['HGVSC_Transcript'].str.split('.',expand=True)[[0]] + outputf = open(pro_dir+'temp/'+filename.replace('.tsv', '_prc.tsv'), 'w') + + gene_list = inputf['Gene name'].unique().tolist() + for gene in gene_list: + samples_gene = len(inputf.loc[inputf['Gene name'] == gene]['ID_sample'].unique().tolist()) + mut_list = inputf.loc[inputf['Gene name'] == gene]['GENOMIC_MUTATION_ID'].unique().tolist() + for mut in mut_list: + samples_mut = len(inputf.loc[inputf['GENOMIC_MUTATION_ID'] == mut]['ID_sample'].unique().tolist()) + transcript_list = inputf.loc[(inputf['Gene name'] == gene) & (inputf['GENOMIC_MUTATION_ID'] == mut)]['HGVSC_Transcript'].unique().tolist() + for trans in transcript_list: + FATHMM = ['' if x is np.nan else x for x in inputf.loc[(inputf['Gene name'] == gene) & (inputf['GENOMIC_MUTATION_ID'] == mut) & (inputf['HGVSC_Transcript'] == trans)]['FATHMM prediction'].tolist()][0] + HGVSc = inputf.loc[(inputf['Gene name'] == gene) & (inputf['GENOMIC_MUTATION_ID'] == mut) & (inputf['HGVSC_Transcript'] == trans)]['HGVSC'].tolist()[0] + + outputf.write('\t'.join([':'.join([gene, trans, HGVSc]), mut, FATHMM, str(samples_gene), str(samples_mut), str(total_cosmic_rec)])+'\n') + outputf.close() + +def create_cosmic_temp_files(idx): + outputn = cosmic_file.replace('.tsv.gz','.tsv.filtered.gz') + output_file = pro_dir+'temp/'+'COSMIC_'+str(idx)+'.tsv' + select_column = ['Gene name', 'ID_sample', 'GENOMIC_MUTATION_ID', 'GRCh', 'Mutation genome position', 'FATHMM prediction', 'Mutation somatic status', 'HGVSC'] + outputf = open(output_file,'w') + outputf.write('\t'.join(select_column)+'\n') + outputf.close() + + cleaned_genes_group = [x for x in genes_group[idx] if str(x) != 'None'] + gene_list = ['^'+x+'_' for x in cleaned_genes_group]+['^'+x+'\t' for x in cleaned_genes_group] + + command = 'zcat '+pro_dir+outputn+' | egrep \''+'|'.join(gene_list)+'\' >> '+ output_file + subprocess.call(command,shell=True) + +def process_cosmic(): + + global cosmic_file + cosmic_file = 'CosmicMutantExport.tsv.gz' + + print('Processing COSMIC data...') + + print('-->Filtering COSMIC...') + #filtering columns + inputf = gzip.open(dwn_dir+cosmic_file,'r') + first_line = inputf.readline().rstrip().decode().split('\t') + global outputn + outputn = cosmic_file.replace('.tsv.gz','.tsv.filtered.gz') + select_column = ['Gene name', 'ID_sample', 'GENOMIC_MUTATION_ID', 'GRCh', 'Mutation genome position', 'FATHMM prediction', 'Mutation somatic status', 'HGVSC'] + 
inputf.close() + + #filtering records + idx = [first_line.index(i) for i in select_column] + cols = [str(i + 1) for i in idx] + #some records have an empty ID (not counted) + command = 'zcat '+dwn_dir+cosmic_file+' | cut -f '+','.join(cols)+' | awk -F"\t" \'($3 != "" && $4 == "38") || $1 == "Gene name" {print}\' | gzip > '+pro_dir+outputn + subprocess.call(command,shell=True) + + #obtaining list of genes + gfile = pro_dir+'gene_list.txt' + command = 'zcat '+pro_dir+outputn+' | cut -f 1 | cut -d\'_\' -f 1 | grep -v Gene | sort | uniq > '+gfile + subprocess.call(command,shell=True) + + print('-->Creating COSMIC file...') + genes_file = pd.read_csv(gfile, sep ="\t", header=None, low_memory=False) + gene_list = genes_file[0].tolist() + + def grouper(n, iterable, fillvalue=None): + import itertools + args = [iter(iterable)] * n + return itertools.zip_longest(*args, fillvalue=fillvalue) + + global genes_group + genes_group = list(grouper(10, gene_list)) + + os.mkdir(pro_dir+'temp/') + a_pool = multiprocessing.Pool(processes=9) + + result_list_tqdm = [] + for result in tqdm(a_pool.imap(create_cosmic_temp_files, range(0,len(genes_group))), total=len(genes_group)): + result_list_tqdm.append(result) + + with gzip.open(pro_dir+outputn,'r') as f: + global total_cosmic_rec + total_cosmic_rec = len(f.readlines()) - 1 #removing the header for the count + + gfreq = open(pro_dir+'cosmic_gene_freq.tsv','w') + gfreq.write('\t'.join(['Gene name', 'Gene_freq', 'Total'])+'\n') + gfreq.close() + gfreq = open(pro_dir+'cosmic_gene_freq.tsv','a') + listdir = os.listdir(pro_dir+'temp/') + for efile in listdir: + inputf = pd.read_csv(pro_dir+'temp/'+efile, sep ="\t", header=0, low_memory=False) + inputf[['Gene name']] = inputf['Gene name'].str.split('_',expand=True)[[0]] + gene_list = inputf['Gene name'].unique().tolist() + for gene in gene_list: + samples_gene = len(inputf.loc[inputf['Gene name'] == gene]['ID_sample'].unique().tolist()) + + gfreq.write('\t'.join([gene, str(samples_gene), str(total_cosmic_rec)])+'\n') + gfreq.close() + + a_pool = multiprocessing.Pool(processes=9) + + result_list_tqdm = [] + for result in tqdm(a_pool.imap(process_file, os.listdir(pro_dir+'temp/')), total=len(os.listdir(pro_dir+'temp/'))): + result_list_tqdm.append(result) + + read_files = glob.glob(pro_dir+'temp/'+"*_prc.tsv") + with open(pro_dir+'COSMIC.tsv', 'ab') as outfile: + outfile.write(bytes('\t'.join(['Cosmic_key', 'cosmic_id', 'FATHMM', 'Gene_freq', 'Mut_freq', 'Total'])+'\n','utf-8')) + for f in read_files: + with open(f, "rb") as infile: + outfile.write(infile.read()) + + shutil. 
rmtree(pro_dir+'temp/')
+
+def process_xml_clinvar(ofile):
+
+    outputf = open(ofile,'a')
+
+    tree = ET.parse(xml_section)
+    root = tree.getroot()
+
+    for ClinVarSet in root.iter('ClinVarSet'):
+        (ACC, ASSEMBLY, CHR, VCF_POS, VCF_REF, VCF_ALT, SYMBOL, TRAIT, SIG) = ([], [], [], [], [], [], [], [], [])
+        for ReferenceClinVarAssertion in ClinVarSet.iter('ReferenceClinVarAssertion'):
+            for ClinVarAccession in ReferenceClinVarAssertion.iter('ClinVarAccession'):
+                ACC.append(ClinVarAccession.attrib['Acc'])
+            for MeasureSet in ReferenceClinVarAssertion.iter('MeasureSet'):
+                for Measure in MeasureSet.iter('Measure'):
+                    for SequenceLocation in Measure.iter('SequenceLocation'):
+                        if 'positionVCF' in SequenceLocation.attrib.keys():
+                            ASSEMBLY.append(SequenceLocation.attrib['Assembly'])
+                            CHR.append(SequenceLocation.attrib['Chr'])
+                            VCF_POS.append(SequenceLocation.attrib['positionVCF'])
+                            VCF_REF.append(SequenceLocation.attrib['referenceAlleleVCF'])
+                            VCF_ALT.append(SequenceLocation.attrib['alternateAlleleVCF'])
+                    for Symbol in Measure.iter('Symbol'):
+                        for ElementValue in Symbol.iter('ElementValue'):
+                            if ElementValue.attrib['Type'] == 'Preferred': SYMBOL.append(ElementValue.text)
+            for TraitSet in ReferenceClinVarAssertion.iter('TraitSet'):
+                for Trait in TraitSet:
+                    for Name in Trait.iter('Name'):
+                        for ElementValue in Name.iter('ElementValue'):
+                            if ElementValue.attrib['Type'] == 'Preferred': TRAIT.append(ElementValue.text)
+            for ClinicalSignificance in ReferenceClinVarAssertion.iter('ClinicalSignificance'):
+                for Description in ClinicalSignificance.iter('Description'):
+                    SIG.append(Description.text)
+
+        for idx in range(len(ASSEMBLY)):
+            outputf.write('\t'.join(['::'.join(list(set(ACC))),ASSEMBLY[idx],CHR[idx],VCF_POS[idx],VCF_REF[idx],VCF_ALT[idx],'::'.join(list(set(SYMBOL))),'::'.join(list(set(TRAIT))),'::'.join(list(set(SIG)))])+'\n')
+
+    outputf.close()
+
+def process_clinvar():
+
+    print('Processing ClinVar data...')
+
+    clinvar_file = 'ClinVarFullRelease_00-latest.xml.gz'
+
+    print('-->Filtering ClinVar...')
+
+    #filter records, keeping only the tags consumed by process_xml_clinvar
+    inputf = gzip.open(dwn_dir+clinvar_file,'rb')
+    outputn = clinvar_file.replace('.xml.gz','tagfiltered.xml.gz')
+    outputf = gzip.open(pro_dir+outputn,'wb')
+
+    for l in inputf:
+        if re.search('xml version|ReleaseSet|ClinVarSet|ReferenceClinVarAssertion|ClinVarAccession|MeasureSet|Measure|SequenceLocation|Symbol|ElementValue|TraitSet|Trait|Name|ClinicalSignificance|Description', l.decode()):
+            outputf.write(l)
+
+    inputf.close()
+    outputf.close()
+
+    print('-->Creating ClinVar file...')
+
+    #create tsv file
+    global xml_section
+    xml_section = pro_dir+'xml_temp.xml'
+    outputf = open(pro_dir+'Clinvar.tsv','w')
+    outputf.write('\t'.join(['Acc','Assembly','Chr','VCF_pos','VCF_ref','VCF_alt','Gene','Trait','Significance'])+'\n')
+    outputf.close()
+
+    #counter flags whether a ClinVarSet section is currently open
+    counter = 0
+    xml = gzip.open(pro_dir+outputn, 'rb')
+
+    for line in xml:
+        line = line.decode()
+        if re.search('^<ClinVarSet', line):
+            xml_section_file = open(xml_section, 'w')
+            xml_section_file.write('<?xml version="1.0" encoding="UTF-8"?>'+'\n')
+            xml_section_file.write(line)
+            counter = 1
+        elif re.search('^</ClinVarSet>', line):
+            xml_section_file.write(line)
+            xml_section_file.close()
+            counter = 0
+            process_xml_clinvar(pro_dir+'Clinvar.tsv')
+        elif counter == 1:
+            xml_section_file.write(line)
+
+    xml.close()
+    os.remove(xml_section)
+    os.remove(pro_dir+outputn)
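+
+# Pfam-A.full is distributed in Stockholm format; process_pfam() keeps the
+# per-family header lines and the human sequence lines, which look roughly like
+# (illustrative):
+#   #=GF ID   Pkinase
+#   #=GF AC   PF00069.28
+#   #=GF DE   Protein kinase domain
+#   #=GS BRAF_HUMAN/457-714  AC P15056.4
+# and rewrites them as one domain/protein/coordinates row per #=GS line.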
+
+def process_pfam():
+
+    print('Processing Pfam data...')
+
+    pfam_file = 'Pfam-A.full.gz'
+
+    print('-->Filtering Pfam...')
+    #filter records
+    inputf = gzip.open(dwn_dir+pfam_file,'rb')
+    outputn = pfam_file.replace('.full.gz','.full.filtered.gz')
+    outputf = gzip.open(pro_dir+outputn,'wb')
+
+    for l in inputf:
+        if re.search('^#=GF ID|^#=GF AC|^#=GF DE|_HUMAN|^//',l.decode('latin-1')):
+            outputf.write(l)
+
+    inputf.close()
+    outputf.close()
+
+    print('-->Creating Pfam file...')
+    #create tsv file
+    outputf = open(pro_dir+'Pfam-A.full.tsv','w')
+    outputf.write('\t'.join(['DOMAIN_ID','PFAM_ACC','DOMAIN_DESCRIPT','PROTEIN_NAME','PROTEIN_ACC','START','END'])+'\n')
+
+    filei = gzip.open(pro_dir+outputn,'rb')
+
+    (domain_id, pfam_acc, domain_des, protein_name, protein_acc, start, end) = ('', '', '', '', '', '', '')
+
+    for line in filei:
+        line = line.decode('latin-1')
+        line = line.strip('\n')
+        line_a = line.split(' ')
+        if line_a[0] == '//':
+            (domain_id, pfam_acc, domain_des, protein_name, protein_acc, start, end) = ('', '', '', '', '', '', '')
+        else:
+            if line_a[1] == 'ID': domain_id = ' '.join(line_a[4:])
+            if line_a[1] == 'AC': pfam_acc = ' '.join(line_a[4:])
+            if line_a[1] == 'DE': domain_des = ' '.join(line_a[4:])
+            if line_a[0] == '#=GS' and len(line_a) > 10:
+                if 'AC' in line_a:
+                    protein_name = line_a[1].split('_')[0]
+                    protein_acc = line_a[line_a.index('AC')+1].split('.')[0]
+                    start = line_a[1].split('/')[1].split('-')[0]
+                    end = line_a[1].split('/')[1].split('-')[1]
+
+                    outputf.write('\t'.join([domain_id, pfam_acc, domain_des, protein_name, protein_acc, start, end])+'\n')
+
+    filei.close()
+    outputf.close()
+    os.remove(pro_dir+outputn)
+
+def process_interpro():
+
+    print('Processing Interpro data...')
+
+    interpro_file = 'match_complete.xml.gz'
+
+    #filter records, keeping the XML skeleton and the human protein blocks
+    print('-->Filtering Interpro...')
+    inputf = gzip.open(dwn_dir+interpro_file,'rb')
+    outputn = interpro_file.replace('.xml.gz','.xml.filtered.gz')
+    outputf = gzip.open(pro_dir+outputn,'wb')
+
+    for line in inputf:
+        line = line.decode().rstrip()
+        if re.search('^<\?xml|^<!DOCTYPE|^<interpromatch|^</interpromatch', line):
+            outputf.write(bytes(line+'\n','utf-8'))
+        elif re.search('^<protein', line) and re.search('_HUMAN', line):
+            outputf.write(bytes(line+'\n','utf-8'))
+            line = next(inputf).decode().rstrip()
+            while not re.search('^</protein>', line):
+                outputf.write(bytes(line+'\n','utf-8'))
+                line = next(inputf).decode().rstrip()
+            outputf.write(bytes(line+'\n','utf-8'))
+
+    inputf.close()
+    outputf.close()
+
+    print('-->Creating Interpro file...')
+
+    outputf = open(pro_dir+'Interpro.tsv','w')
+    outputf.write('\t'.join(['DOMAIN_ID','DOMAIN_DESCRIPT','PROTEIN_NAME','PROTEIN_ACC','START','END'])+'\n')
+
+    inputf = gzip.open(pro_dir+outputn,'rb')
+    tree = ET.parse(inputf)
+    root = tree.getroot()
+
+    for protein in root.iter('protein'):
+        (domain_id, domain_des, protein_name, protein_acc, start, end) = ('', '', '', '', '', '')
+        protein_acc = protein.attrib['id']
+        protein_name = protein.attrib['name'].split('_')[0]
+
+        for match in protein.iter('match'):
+            for ipr in match.iter('ipr'):
+                domain_id = ipr.attrib['id']
+                domain_des = ipr.attrib['name']
+            for lcn in match.iter('lcn'):
+                start = lcn.attrib['start']
+                end = lcn.attrib['end']
+                outputf.write('\t'.join([domain_id, domain_des, protein_name, protein_acc, start, end])+'\n')
+
+    outputf.close()
+    inputf.close()
+    os.remove(pro_dir+outputn)
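+
+# uniprot_sprot.xml stores one <entry> per protein with <accession>,
+# <gene>/<name> and <organism>/<name> children, roughly (illustrative):
+#   <entry>
+#     <accession>P15056</accession>
+#     <gene><name type="primary">BRAF</name></gene>
+#     <organism><name type="scientific">Homo sapiens</name><name type="common">Human</name></organism>
+#   </entry>
+# process_uniprot() keeps the entries whose last organism <name> is 'Human'.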
+
+def process_uniprot():
+
+    print('Processing Uniprot data...')
+
+    uniprot_file = 'uniprot_sprot.xml.gz'
+
+    print('-->Filtering Uniprot...')
+    #filter records
+    inputf = gzip.open(dwn_dir+uniprot_file,'rb')
+    outputn = uniprot_file.replace('.xml.gz','.xml.filtered.gz')
+    outputf = gzip.open(pro_dir+outputn,'wb')
+
+    for l in inputf:
+        l = l.decode()
+        if re.search('\?xml|<uniprot|</uniprot|<entry|</entry|<accession|<gene|</gene|<name|<organism|</organism', l):
+            outputf.write(bytes(l,'utf-8'))
+
+    inputf.close()
+    outputf.close()
+
+    print('-->Creating Uniprot file...')
+
+    outputf = open(pro_dir+'Uniprot.tsv','w')
+    outputf.write('\t'.join(['PROTEIN_ID','GENE_NAME','PUBMED_ID','dbSNP_ref'])+'\n')
+
+    inputf = gzip.open(pro_dir+outputn,'rb')
+    tree = ET.parse(inputf)
+    root = tree.getroot()
+
+    for entry in root.iter('entry'):
+        (protein_id, gene_name, organism_name) = ([], '', '')
+        for accession in entry.iter('accession'):
+            protein_id.append(accession.text)
+        for gene in entry.iter('gene'):
+            for name in gene.iter('name'):
+                gene_name = name.text
+        for organism in entry.iter('organism'):
+            for name in organism.iter('name'):
+                organism_name = name.text
+        if organism_name == 'Human':
+            outputf.write('\t'.join([';'.join(protein_id),gene_name,'',''])+'\n')
+
+    outputf.close()
+    inputf.close()
+    os.remove(pro_dir+outputn)
+
+def process_hallmarks():
+
+    print('Processing Hallmarks data...')
+
+    hallmarks_dwn = "hallmarks.xlsx"
+    hallmarks_file = "hallmarks.tsv"
+
+    filei = pd.read_excel(dwn_dir+hallmarks_dwn)
+    filei['Genes_list'] = filei['Genes'].apply(lambda x: x.split(', '))
+    filei = filei.explode('Genes_list')
+    filei.to_csv(pro_dir+hallmarks_file, index=False, sep='\t', header=True)
+
+process_DGIdb()
+process_sabdab()
+process_moalmanac()
+process_GDSC()
+process_KEGG_ATC()
+process_cmap()
+process_FDA()
+process_FDA_label()
+process_EMA()
+process_ct()
+process_cgc_oncovar()
+process_KEGG_ind()
+process_gene_names()
+#manually correct NKX2.1, which the symbol check interprets as the two tokens NKX2 and 1
+process_civic()
+process_oncoKB()
+process_intogen()
+process_depmap()
+process_KEGG_pathways()
+process_SL()
+process_hallmarks()
+
+#exclusive for genomic annotation
+process_cosmic()
+process_clinvar()
+process_pfam()
+process_interpro()
+process_uniprot()
diff --git a/src/python_example.py b/src/python_example.py
new file mode 100644
index 0000000..d29099a
--- /dev/null
+++ b/src/python_example.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+
+#This example uses the 'requests' library
+#To install requests:
+#  sudo pip install requests
+#If you don't have pip:
+#  see http://www.pip-installer.org/en/latest/installing.html
+#  or
+#  wget https://raw.github.com/pypa/pip/master/contrib/get-pip.py
+#  sudo python get-pip.py
+
+import sys
+import argparse
+import json
+import requests
+
+def usage():
+    print("Usage Examples:")
+    print("python python_example.py --help")
+    print("python python_example.py --genes='FLT3'")
+    print("python python_example.py --genes='FLT3,EGFR,KRAS'")
+    print("python python_example.py --genes='FLT3,EGFR' --interaction_sources='TALC,TEND'")
+    print("python python_example.py --genes='FLT3,EGFR' --gene_categories='KINASE'")
+    print("python python_example.py --genes='FLT3,EGFR' --interaction_types='inhibitor'")
+    print("python python_example.py --genes='FLT3,EGFR' --source_trust_levels='Expert curated'")
+    print("python python_example.py --genes='FLT3,EGFR' --antineoplastic_only")
+    print("python python_example.py --genes='FLT3,EGFR,KRAS' --interaction_sources='TALC,TEND,MyCancerGenome' --gene_categories='KINASE' --interaction_types='inhibitor' --antineoplastic_only")
+    sys.exit(0)
+
+def parse_args():
+    parser = argparse.ArgumentParser(description = "A Python example for using the DGIdb API", epilog = "For complete API documentation refer to http://dgidb.org/api")
+    parser.add_argument("-g", "--genes", help="list of gene symbols (REQUIRED). Use official Entrez symbols for best results", dest="genes")
+    parser.add_argument("-is", "--interaction_sources", help="Limit results to those from particular data sources. e.g. 'DrugBank', 'PharmGKB', 'TALC', 'TEND', 'TTD', 'MyCancerGenome'", dest="interaction_sources")
+    parser.add_argument("-it", "--interaction_types", help="Limit results to interactions with drugs that have a particular mechanism of action. e.g. 'inhibitor', 'antibody', etc", dest="interaction_types")
+    parser.add_argument("-gc", "--gene_categories", help="Limit results to genes with a particular druggable gene type. e.g. 'KINASE', 'ION CHANNEL', etc", dest="gene_categories")
+    parser.add_argument("-stl", "--source_trust_levels", help="Limit results based on trust level of the interaction source. e.g. 'Expert curated' or 'Non-curated'", dest = "source_trust_levels")
+    parser.add_argument("-ano", "--antineoplastic_only", help="Limit results to anti-cancer drugs only", dest="antineoplastic_only", action = 'store_true')
+    parser.add_argument("-u", "--usage", help="Usage examples", dest="usage", action = 'store_true')
+    return parser.parse_args()
+
+class DGIAPI:
+    'API Example class for DGI API.'
+    #modified following https://github.com/dgidb/dgidb/issues/324
+    domain = 'https://dgidb.org'
+    api_path = '/api/v1/interactions.json'
+    def __init__(self, args):
+        self.genes = args.genes
+        self.interaction_sources = args.interaction_sources
+        self.interaction_types = args.interaction_types
+        self.gene_categories = args.gene_categories
+        self.source_trust_levels = args.source_trust_levels
+        self.antineoplastic_only = args.antineoplastic_only
+    def run_workflow(self):
+        self.create_request()
+        self.post_request()
+        self.print_response()
+    def create_request(self):
+        self.payload = {}
+        if(self.genes):
+            self.payload['genes'] = self.genes
+        if(self.interaction_sources):
+            self.payload['interaction_sources'] = self.interaction_sources
+        if(self.gene_categories):
+            self.payload['gene_categories'] = self.gene_categories
+        if(self.interaction_types):
+            self.payload['interaction_types'] = self.interaction_types
+        if(self.source_trust_levels):
+            self.payload['source_trust_levels'] = self.source_trust_levels
+        if(self.antineoplastic_only):
+            self.payload['drug_types'] = 'antineoplastic'
+    def post_request(self):
+        self.request = DGIAPI.domain + DGIAPI.api_path
+        self.response = requests.post(self.request, data = self.payload)
+    def print_response(self):
+        response = json.loads(self.response.content)
+        matches = response['matchedTerms']
+        if(matches):
+            print("gene_name\tdrug_name\tinteraction_type\tsource\tgene_categories")
+            for match in matches:
+                gene = match['geneName']
+                categories = match['geneCategories']
+                categories.sort()
+                joined_categories = ",".join(categories)
+                for interaction in match['interactions']:
+                    source = interaction['source']
+                    drug = interaction['drugName']
+                    interaction_type = interaction['interactionType']
+                    print(gene + "\t" + drug + "\t" + interaction_type + "\t" + source + "\t" + joined_categories.lower())
+        for unmatched in response['unmatchedTerms']:
+            print("Unmatched search term: " + unmatched['searchTerm'])
+            print("Possible suggestions: " + ",".join(unmatched['suggestions']))
+
+if __name__ == '__main__':
+    args = parse_args()
+    if(not args.genes or args.usage):
+        usage()
+    da = DGIAPI(args)
+    da.run_workflow()