Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speedup validation and creating derived formats #894

Open
wants to merge 11 commits into
base: devel
Choose a base branch
from
40 changes: 40 additions & 0 deletions Scripts/check-chars.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env perl
use warnings;
use strict;
use utf8;
# Reports may contain non-ASCII characters, so both output streams are UTF-8.
binmode(STDERR,'utf8');
binmode(STDOUT,'utf8');

# The first command-line argument is a glob pattern selecting the files to check.
my @INFILES = glob(shift @ARGV);

# Validate every matched file in turn.
chars($_) for @INFILES;

# Check if $file contains bad characters.
#
# Reads $file as UTF-8, counts the occurrences of every distinct character
# and writes one WARN line to STDERR listing each unwanted code point
# (as "U+XXXX (Nx)") together with its number of occurrences.
#
# Parameters:
#   $file - path of the file to check
# Returns: nothing.
# Dies if $file has no file-name component (e.g. it ends in a slash).
# A file that cannot be opened is reported on STDERR and skipped, so that
# the remaining files are still validated.
sub chars {
    my $file = shift;
    my ($fName) = $file =~ m|([^/]+)$|
        or die "FATAL ERROR: Bad file '$file'\n";
    print STDERR "INFO: Char validation for $fName\n";

    open(my $in, '<:utf8', $file) or do {
        print STDERR "ERROR: Cannot open $file: $!\n";
        return;
    };
    # Slurp the whole file; 'local' keeps the change of $/ from leaking
    # into the rest of the program.
    my $txt = do { local $/; <$in> };
    close($in);
    $txt = '' unless defined $txt;

    # Number of occurrences of each character in the file.
    my %count;
    $count{$_}++ for split(//, $txt);

    my @bad;
    for my $c (sort keys %count) {
        my $cp = ord($c);
        if ($cp == 0x00A0 or                    #NO-BREAK SPACE
            $cp == 0x2011 or                    #NON-BREAKING HYPHEN
            $cp == 0x00AD or                    #SOFT HYPHEN
            $cp == 0xFFFD or                    #REPLACEMENT CHAR
            ($cp >= 0x2000 and $cp <= 0x200A) or #NON-STANDARD SPACES
            ($cp >= 0xE000 and $cp <= 0xF8FF)    #PUA
            ) {
            push(@bad, sprintf("U+%X (%dx)", $cp, $count{$c}));
        }
    }
    print STDERR "WARN: File $fName contains bad chars: " . join('; ', @bad) . "\n"
        if @bad;
    return;
}
11 changes: 10 additions & 1 deletion Scripts/get-includes.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,17 @@
exclude-result-prefixes="#all"
version="2.0">

<!--
DEFAULT VALUE: context-elements="" means no filtering - all xincludes are printed
set a space-separated list of parent elements if only specific included files are necessary
eg context-elements="teiCorpus" for component files
or context-elements="particDesc classDecl" for header files
-->
<xsl:param name="context-elements"></xsl:param>
<xsl:variable name="celems" select="tokenize($context-elements)"/>

<xsl:output encoding="utf-8" method="text"/>
<xsl:template match="xi:include">
<xsl:template match="xi:include[not($celems) or parent::*[local-name() = $celems]]">
<xsl:value-of select="@href"/>
<xsl:text>&#10;</xsl:text>
</xsl:template>
Expand Down
42 changes: 34 additions & 8 deletions Scripts/parlamint2distro.pl
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ sub usage {
# Problem with Out of heap space with TR, NL, GB for ana
$SaxonX = "java -Xmx${procMemGB}g -jar $Bin/bin/saxon.jar";

# reduce parallelism for metadata processing: if listPerson and listOrg are large, there can be a problem with memory size
my $minProcThreads = $procThreads > 30 ? int($procThreads/3) : $procThreads;

# logger variable stores how long certain parts of the code take; used by the logger subroutine
my $logger = {
code => '',
Expand Down Expand Up @@ -176,6 +179,7 @@ sub usage {
$scriptTexts = "$Bin/parlamintp-tei2text.pl";
$scriptVerts = "$Bin/parlamintp-tei2vert.pl";
$scriptConls = "$Bin/parlamintp2conllu.pl";
$scriptMetas = "$Bin/parlamintp-tei2meta.pl";

$XX_template = "ParlaMint-XX";

Expand Down Expand Up @@ -330,14 +334,20 @@ sub usage {
`rm -fr $outSmpDir; mkdir $outSmpDir`;
if (-e $outTeiRoot) {
`$Saxon outDir=$outSmpDir -xsl:$scriptSample $outTeiRoot`;
my $outTeiSmpRoot = File::Spec->catfile($outSmpDir, (File::Spec->splitpath($outTeiRoot))[2]);
`$scriptTexts -jobs $procThreads -in $outSmpDir -out $outSmpDir`;
&dirify($outSmpDir);
`$scriptMetas -jobs $procThreads -inRoot $outTeiSmpRoot -out $outSmpDir`;
}
else {print STDERR "WARN: No TEI files for $countryCode samples (needed root file is $outTeiRoot)\n"}
if (-e $outAnaRoot) {
`$Saxon outDir=$outSmpDir -xsl:$scriptSample $outAnaRoot`;
#Make also derived files
my $outAnaSmpRoot = File::Spec->catfile($outSmpDir, (File::Spec->splitpath($outAnaRoot))[2]);
`$scriptTexts -jobs $procThreads -in $outSmpDir -out $outSmpDir` unless $outTeiRoot;
`$scriptVerts -jobs $procThreads -in $outSmpDir -out $outSmpDir`;
&dirify($outSmpDir);
`$scriptMetas -jobs $procThreads -inRoot $outAnaSmpRoot -out $outSmpDir` unless $outTeiRoot;
`$scriptVerts -jobs $minProcThreads -in $outSmpDir -out $outSmpDir`;
if (-e "$regiDir/$vertRegi") {`cp $regiDir/$vertRegi $outSmpDir/$vertRegi.$regiExt`}
else {print STDERR "WARN: registry file $vertRegi not found\n"}
`$scriptConls -jobs $procThreads -in $outSmpDir -out $outSmpDir`
Expand All @@ -356,11 +366,20 @@ sub usage {
}
if (($procAll and $procValid) or (!$procAll and $procValid == 1)) {
print STDERR "INFO: ***Validating $countryCode TEI\n";
logger('Validating TEI');

die "FATAL ERROR: Can't find schema directory\n" unless $schemaDir and -e $schemaDir;
`$scriptValid $schemaDir $outSmpDir` if -e $outSmpDir;
`$scriptValid $schemaDir $outTeiDir` if -e $outTeiDir;
`$scriptValid $schemaDir $outAnaDir` if -e $outAnaDir;
if (-e $outSmpDir) {
logger('Validating TEI.sample');
`$scriptValid --procThreads $procThreads $schemaDir $outSmpDir`;
}
if (-e $outTeiDir) {
logger('Validating TEI');
`$scriptValid --procThreads $procThreads $schemaDir $outTeiDir`;
}
if (-e $outAnaDir) {
logger('Validating TEI.ana');
`$scriptValid --procThreads $procThreads $schemaDir $outAnaDir`;
}
}
if (($procAll and $procTxt) or (!$procAll and $procTxt == 1)) {
print STDERR "INFO: ***Making $countryCode text\n";
Expand All @@ -373,8 +392,14 @@ sub usage {
if ($MT) {$inReadme = "$docsDir/README-$MT.text.txt"}
else {$inReadme = "$docsDir/README.text.txt"}
&cp_readme($countryCode, $handleTxt, $Version, $inReadme, "$outTxtDir/00README.txt");
if (-e $outTeiDir) {`$scriptTexts -jobs $procThreads -in $outTeiDir -out $outTxtDir`}
elsif (-e $outAnaDir) {`$scriptTexts -jobs $procThreads -in $outAnaDir -out $outTxtDir`}
if (-e $outTeiDir) {
`$scriptTexts -jobs $procThreads -in $outTeiDir -out $outTxtDir`;
`$scriptMetas -jobs $procThreads -inRoot $outTeiRoot -out $outTxtDir`;
}
elsif (-e $outAnaDir) {
`$scriptTexts -jobs $procThreads -in $outAnaDir -out $outTxtDir`;
`$scriptMetas -jobs $procThreads -in $outAnaRoot -out $outTxtDir`;
}
else {die "FATAL ERROR: Neither $outTeiDir nor $outAnaDir exits\n"}
&dirify($outTxtDir);
}
Expand All @@ -388,6 +413,7 @@ sub usage {
else {$inReadme = "$docsDir/README.conll.txt"}
&cp_readme($countryCode, $handleAna, $Version, $inReadme, "$outConlDir/00README.txt");
`$scriptConls -jobs $procThreads -in $outAnaDir -out $outConlDir`;
`$scriptMetas -jobs $procThreads -inRoot $outAnaRoot -out $outConlDir`;
&dirify($outConlDir);
}
if (($procAll and $procVert) or (!$procAll and $procVert == 1)) {
Expand All @@ -401,7 +427,7 @@ sub usage {
&cp_readme($countryCode, $handleAna, $Version, $inReadme, "$outVertDir/00README.txt");
if (-e "$regiDir/$vertRegi") {`cp $regiDir/$vertRegi $outVertDir/$vertRegi.$regiExt`}
else {print STDERR "WARN: registry file $vertRegi not found\n"}
`$scriptVerts -jobs $procThreads -in $outAnaDir -out $outVertDir`;
`$scriptVerts -jobs $minProcThreads -in $outAnaDir -out $outVertDir`;
&dirify($outVertDir);
}
logger();
Expand Down
76 changes: 76 additions & 0 deletions Scripts/parlamintp-tei2meta.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env perl
use warnings;
use utf8;

# Print a short usage message to STDERR.
# The option names shown here must match those declared in the GetOptions
# call of this script (help, inRoot, out, jobs).
sub usage
{
    print STDERR ("Usage: parlamintp-tei2meta.pl [-help] -jobs <Jobs> -inRoot <corpusRoot> -out <OutputDirectory>\n");
    print STDERR ("       Converts ParlaMint component files in the <corpusRoot> to\n");
    print STDERR ("       -meta.tsv files in the <OutputDirectory>\n");
    print STDERR ("       using parallel <Jobs> in execution.\n");
}

use Getopt::Long;
use FindBin qw($Bin);
use File::Spec;
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory

# Main script: for every component file xi:included from the corpus root,
# run the metadata stylesheet in parallel and write one TSV per component
# (and per output language) into the output directory.

# Scratch directory for the list of component files; removed on exit.
my $tempdirroot = "$Bin/tmp";
my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1);

GetOptions
    (
     'help'     => \$help,
     'inRoot=s' => \$inRoot,
     'out=s'    => \$outDir,
     'jobs=i'   => \$procThreads,
    );

if ($help) {
    &usage;
    exit;
}

# Both paths are mandatory. Without this check an undefined $outDir would
# turn the clean-up below into "rm -f /*-meta.tsv", and an undefined $inRoot
# would be handed to splitpath and Saxon.
unless ($inRoot and $outDir) {
    &usage;
    die "FATAL ERROR: Both -inRoot and -out must be specified\n";
}

$inRoot = File::Spec->rel2abs($inRoot);
# Directory part of the corpus root; the component paths are relative to it.
$inDir = (File::Spec->splitpath($inRoot))[1];
$outDir = File::Spec->rel2abs($outDir);
$procThreads = 1 unless $procThreads;

binmode(STDERR, 'utf8');

$Para  = "parallel --gnu --halt 0 --jobs $procThreads";
$Saxon = "java -jar $Bin/bin/saxon.jar";
$scriptMeta = "$Bin/parlamint2meta.xsl";
$Includes = "$Bin/get-includes.xsl";

# Remove metadata files left over from a previous run
`rm -f $outDir/*-meta.tsv`;

#Store all files to be processed in $fileFile
# (only the xi:include elements that are children of teiCorpus are listed,
# each prefixed with the corpus directory)
$fileFile = "$DIR/files.lst";
`$Saxon -xsl:$Includes context-elements="teiCorpus" $inRoot | sed "s#^#$inDir/#" > $fileFile`;

#Is this an MTed corpus?
$MT = $inDir =~ m/-en/;

print STDERR "INFO: Making metadata files from component files in $inRoot\n";
#For MTed corpora output only en metadata, for native, both xx and en
if ($MT) {@outLangs = ('en')} else {@outLangs = ('xx', 'en')}
# For orig corpora make ParlaMint-XX-meta.tsv in corpus language and ParlaMint-XX-meta-en.tsv in English
# For MTed corpora we produce ParlaMint-XX-en-meta.tsv in English
foreach my $outLang (@outLangs) {
    my $outSuffix;
    if    ($MT and $outLang eq 'xx') {}
    elsif ($MT and $outLang eq 'en') {$outSuffix = "-meta.tsv"}
    elsif ($outLang eq 'xx') {$outSuffix = "-meta.tsv"}
    elsif ($outLang eq 'en') {$outSuffix = "-meta-en.tsv"}
    if ($outSuffix) {
        # {} and {/.} are GNU parallel placeholders: the input path and its
        # basename without extension.
        $command = "$Saxon" .
            " meta=$inRoot" .
            " out-lang=$outLang" .
            " -xsl:$scriptMeta {} > $outDir/{/.}$outSuffix";
        `cat $fileFile | $Para '$command'`;
        # The rm following looks like a bug, as no TSV files are left if we are processing only .ana!
        #`rm -f $outDir/*.ana-meta.tsv`;
    }
}
# Strip ".ana" from the names of the generated files. The backslash is
# doubled because backticks interpolate like a double-quoted string: a single
# "\." would reach the shell as "." and match any character before "ana".
`rename 's/\\.ana//' $outDir/*-meta*.tsv`;
30 changes: 2 additions & 28 deletions Scripts/parlamintp-tei2text.pl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ sub usage
use FindBin qw($Bin);
use File::Spec;
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory

my $tempdirroot = "$Bin/tmp";
my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1);

Expand All @@ -37,18 +38,16 @@ sub usage
binmode(STDERR, 'utf8');

$Para = "parallel --gnu --halt 0 --jobs $procThreads";

$Saxon = "java -jar $Bin/bin/saxon.jar";

$scriptMeta = "$Bin/parlamint2meta.xsl";
$scriptText = "$Bin/parlamint-tei2text.xsl";

print STDERR "INFO: Converting directory $inDir\n";

#Store all files to be processed in $fileFile
$fileFile = "$DIR/files.lst";
$corpusFiles = "$inDir/*_*.xml $inDir/*/*_*.xml";
#Is this an MTed corpus?
$MT = $inDir =~ m/-en/;

#We can convert either plain files or .ana files
open(TMP, '>:utf8', $fileFile);
Expand All @@ -72,28 +71,3 @@ sub usage
`cat $fileFile | $Para '$command'`;
`rename 's/\.ana//' $outDir/*.txt`;

print STDERR "INFO: Making metadata files\n";
opendir(CORPUSDIR, $inDir);
@rootFile = grep {/ParlaMint-[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?(\.ana)?\.xml$/} readdir(CORPUSDIR);
closedir(CORPUSDIR);
#For MTed corpora output only en metadata, for native, both xx and en
if ($MT) {@outLangs = ('en')} else {@outLangs = ('xx', 'en')}
# For orig corpora make ParlaMint-XX-meta.tsv in corpus language and ParlaMint-XX-meta-en.tsv in English
# For MTed corpora we produce ParlaMint-XX-en-meta.tsv in English
foreach my $outLang (@outLangs) {
my $outSuffix;
if ($MT and $outLang eq 'xx') {}
elsif ($MT and $outLang eq 'en') {$outSuffix = "-meta.tsv"}
elsif ($outLang eq 'xx') {$outSuffix = "-meta.tsv"}
elsif ($outLang eq 'en') {$outSuffix = "-meta-en.tsv"}
if ($outSuffix) {
$command = "$Saxon" .
" meta=" . File::Spec->catfile($inDir,$rootFile[0]) .
" out-lang=$outLang" .
" -xsl:$scriptMeta {} > $outDir/{/.}$outSuffix";
`cat $fileFile | $Para '$command'`;
# The rm following looks like a bug, as no TSV files are left if we are processing only .ana!
#`rm -f $outDir/*.ana-meta.tsv`;
}
}
`rename 's/\.ana//' $outDir/*-meta*.tsv`;
2 changes: 2 additions & 0 deletions Scripts/parlamintp-tei2vert.pl
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@ sub usage
use FindBin qw($Bin);
use File::Spec;
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory
use Getopt::Long;
my $tempdirroot = "$Bin/tmp";
my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1);


GetOptions
(
'help' => \$help,
Expand Down
23 changes: 2 additions & 21 deletions Scripts/parlamintp2conllu.pl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ sub usage
my $tempdirroot = "$Bin/tmp";
my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1);


GetOptions
(
'help' => \$help,
Expand All @@ -46,11 +47,11 @@ sub usage
$procThreads = 1 unless $procThreads;

$Para = "parallel --gnu --halt 0 --jobs $procThreads";

$Saxon = "java -jar $Bin/bin/saxon.jar";
$scriptValid = "$Bin/bin/tools/validate.py";

$scriptConvert = "$Bin/parlamint2conllu.xsl";
$scriptMeta = "$Bin/parlamint2meta.xsl";

#This should be somehow factorised out!!
$country2lang{'AT'} = 'de';
Expand Down Expand Up @@ -119,28 +120,8 @@ sub usage
close TMP;

`mkdir $outDir` unless -e "$outDir";
`rm -f $outDir/*-meta.tsv`;
`rm -f $outDir/*.conllu`;

#For MTed corpora output only en metadata, for native, both xx and en
if ($MT) {@outLangs = ('en')} else {@outLangs = ('xx', 'en')}
# For orig corpora make ParlaMint-XX-meta.tsv in corpus language and ParlaMint-XX-meta-en.tsv in English
# For MTed corpora we produce ParlaMint-XX-en-meta.tsv in English
foreach my $outLang (@outLangs) {
my $outSuffix;
if ($MT and $outLang eq 'xx') {}
elsif ($MT and $outLang eq 'en') {$outSuffix = "-meta.tsv"}
elsif ($outLang eq 'xx') {$outSuffix = "-meta.tsv"}
elsif ($outLang eq 'en') {$outSuffix = "-meta-en.tsv"}
if ($outSuffix) {
$command = "$Saxon meta=$rootAnaFile" .
" out-lang=$outLang" .
" -xsl:$scriptMeta {} > $outDir/{/.}$outSuffix";
`cat $fileFile | $Para '$command'`;
}
}
`rename 's/\.ana//' $outDir/*-meta*.tsv`;

# Produce common CoNLL-U, even if we have more languages in a corpus
if ($langs !~ /,/) {$checkLang = $langs}
else {($checkLang) = $langs =~ /(.+?),/}
Expand Down
Loading