From ecf009854e56cb42f3ba6991a9d9d70d3fc29ba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?= Date: Mon, 20 Jan 2025 10:02:31 +0100 Subject: [PATCH 1/9] move character validation to separate file #846 --- Scripts/check-chars.pl | 40 ++++++++++++++++++++++++++++ Scripts/validate-parlamint.pl | 49 ++++++++++------------------------- 2 files changed, 53 insertions(+), 36 deletions(-) create mode 100755 Scripts/check-chars.pl diff --git a/Scripts/check-chars.pl b/Scripts/check-chars.pl new file mode 100755 index 000000000..539711baa --- /dev/null +++ b/Scripts/check-chars.pl @@ -0,0 +1,40 @@ +#!/usr/bin/env perl +use warnings; +use strict; +use utf8; +my @INFILES = glob(shift); +binmode(STDOUT,'utf8'); +binmode(STDERR,'utf8'); + +foreach my $file (@INFILES) { + chars($file); +} + +# Check if $file contains bad characters +sub chars { + my $file = shift; + my %c; + my @bad = (); + my ($fName) = $file =~ m|([^/]+)$| + or die "FATAL ERROR: Bad file '$file'\n"; + print STDERR "INFO: Char validation for $fName\n"; + open(IN, '<:utf8', $file); + undef $/; + my $txt = ; + undef %c; + for my $c (split(//, $txt)) {$c{$c}++} + for my $c (sort keys %c) { + if (ord($c) == hex('00A0') or #NO-BREAK SPACE + ord($c) == hex('2011') or #NON-BREAKING HYPHEN + ord($c) == hex('00AD') or #SOFT HYPHEN + ord($c) == hex('FFFD') or #REPLACEMENT CHAR + (ord($c) >= hex('2000') and ord($c) <= hex('200A')) or #NON-STANDARD SPACES + (ord($c) >= hex('E000') and ord($c) <= hex('F8FF')) #PUA + ) { + my $message = sprintf("U+%X (%dx)", ord($c), $c{$c}); + push(@bad, $message) + } + } + print STDERR "WARN: File $fName contains bad chars: " . join('; ', @bad) . 
"\n" + if @bad +} \ No newline at end of file diff --git a/Scripts/validate-parlamint.pl b/Scripts/validate-parlamint.pl index f0afe9804..67a6ea8fe 100755 --- a/Scripts/validate-parlamint.pl +++ b/Scripts/validate-parlamint.pl @@ -40,6 +40,7 @@ sub usage $Compose = "$Bin/parlamint-composite-teiHeader.xsl"; $Links = "$Bin/check-links.xsl"; +$Chars = "$Bin/check-chars.pl"; $Valid = "$Bin/validate-parlamint.xsl"; $Valid_particDesc = "$Bin/validate-parlamint-particDesc.xsl"; $Includes = "$Bin/get-includes.xsl"; @@ -87,7 +88,7 @@ sub validate { my $interfix = $type; $interfix =~ s/^TEI//; print STDERR "INFO: Validating $type root $rootFile\n"; - &chars($rootFile); + &run($Chars, $rootFile); &run("$Jing $schemaDir/ParlaMint-teiCorpus$interfix.rng", $rootFile); &run("$Saxon outDir=$tmpDir -xsl:$Compose", $rootFile); &run("$Jing $schemaDir/ParlaMint.odd.rng", "$tmpDir/$fileName"); @@ -100,12 +101,12 @@ sub validate { if (-e $file) { if($file =~ m/ParlaMint-(?:[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?)?.?(taxonomy|listPerson|listOrg).*\.xml/){ print STDERR "INFO: Validating file included in teiHeader $file\n"; - &chars($file); + &run($Chars, $file); &run("$Jing $schemaDir/ParlaMint-$1.rng", $file); &run("$Saxon meta=$rootFile -xsl:$Links", $file); } else { print STDERR "INFO: Validating component $type file $file\n"; - &chars($file); + &run($Chars, $file); &run("$Jing $schemaDir/ParlaMint-TEI$interfix.rng", $file); &run("$Jing $schemaDir/ParlaMint.odd.rng", $file); &run("$Saxon -xsl:$Valid", $file); @@ -116,54 +117,30 @@ sub validate { } } -# Check if $file contains bad characters -sub chars { - my $file = shift; - my %c; - my @bad = (); - my ($fName) = $file =~ m|([^/]+)$| - or die "FATAL ERROR: Bad file '$file'\n"; - print STDERR "INFO: Char validation for $fName\n"; - open(IN, '<:utf8', $file); - undef $/; - my $txt = ; - undef %c; - for $c (split(//, $txt)) {$c{$c}++} - for $c (sort keys %c) { - if (ord($c) == hex('00A0') or #NO-BREAK SPACE - ord($c) == hex('2011') or 
#NON-BREAKING HYPHEN - ord($c) == hex('00AD') or #SOFT HYPHEN - ord($c) == hex('FFFD') or #REPLACEMENT CHAR - (ord($c) >= hex('2000') and ord($c) <= hex('200A')) or #NON-STANDARD SPACES - (ord($c) >= hex('E000') and ord($c) <= hex('F8FF')) #PUA - ) { - $message = sprintf("U+%X (%dx)", ord($c), $c{$c}); - push(@bad, $message) - } - } - print STDERR "WARN: File $fName contains bad chars: " . join('; ', @bad) . "\n" - if @bad -} - + sub run { my $command = shift; my $file = shift; my ($fName) = $file =~ m|([^/]+)$| or die "FATAL ERROR: Bad file '$file'\n"; + my $msg = ''; if ($command =~ /$Jing/) { - print STDERR "INFO: XML validation for $fName\n" + $msg = "INFO: XML validation for $fName\n" } elsif ($command =~ /$Compose/) { } + elsif ($command =~ /$Chars/) { + } elsif ($command =~ /$Valid/) { - print STDERR "INFO: Content validaton for $fName\n" + $msg = "INFO: Content validaton for $fName\n" } elsif ($command =~ /$Valid_particDesc/) { - print STDERR "INFO: particDesc content validaton for $fName\n" + $msg = "INFO: particDesc content validaton for $fName\n" } elsif ($command =~ /$Links/) { - print STDERR "INFO: Link checking for $fName\n" + $msg = "INFO: Link checking for $fName\n" } else {die "FATAL ERROR: Weird command $command!\n"} + print STDERR $msg; `$command $file 1>&2`; } From ddef908c6045027c516e7d229f689e5f61941d3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?= Date: Mon, 20 Jan 2025 17:19:43 +0100 Subject: [PATCH 2/9] parallelize validation #846 --- Scripts/parlamint2distro.pl | 17 ++++++-- Scripts/validate-parlamint.pl | 75 ++++++++++++++++++++++++----------- 2 files changed, 64 insertions(+), 28 deletions(-) diff --git a/Scripts/parlamint2distro.pl b/Scripts/parlamint2distro.pl index 54a7b5e8a..885ee5676 100755 --- a/Scripts/parlamint2distro.pl +++ b/Scripts/parlamint2distro.pl @@ -356,11 +356,20 @@ sub usage { } if (($procAll and $procValid) or (!$procAll and $procValid == 1)) { print STDERR "INFO: ***Validating $countryCode 
TEI\n"; - logger('Validating TEI'); + die "FATAL ERROR: Can't find schema directory\n" unless $schemaDir and -e $schemaDir; - `$scriptValid $schemaDir $outSmpDir` if -e $outSmpDir; - `$scriptValid $schemaDir $outTeiDir` if -e $outTeiDir; - `$scriptValid $schemaDir $outAnaDir` if -e $outAnaDir; + if (-e $outSmpDir) { + logger('Validating TEI.sample'); + `$scriptValid --procThreads $procThreads $schemaDir $outSmpDir`; + } + if (-e $outTeiDir) { + logger('Validating TEI'); + `$scriptValid --procThreads $procThreads $schemaDir $outTeiDir`; + } + if (-e $outAnaDir) { + logger('Validating TEI.ana'); + `$scriptValid --procThreads $procThreads $schemaDir $outAnaDir`; + } } if (($procAll and $procTxt) or (!$procAll and $procTxt == 1)) { print STDERR "INFO: ***Making $countryCode text\n"; diff --git a/Scripts/validate-parlamint.pl b/Scripts/validate-parlamint.pl index 67a6ea8fe..e8b5fba4b 100755 --- a/Scripts/validate-parlamint.pl +++ b/Scripts/validate-parlamint.pl @@ -32,9 +32,17 @@ sub usage use FindBin qw($Bin); use File::Spec; +my $procThreads = 1; + +GetOptions + ( + 'procThreads=i'=> \$procThreads, +); + $schemaDir = File::Spec->rel2abs(shift); $inDirs = File::Spec->rel2abs(shift); +$Parallel = "parallel --keep-order --gnu --halt 2 --jobs $procThreads"; $Jing = "java -jar $Bin/bin/jing.jar"; $Saxon = "java -jar $Bin/bin/saxon.jar"; @@ -45,6 +53,7 @@ sub usage $Valid_particDesc = "$Bin/validate-parlamint-particDesc.xsl"; $Includes = "$Bin/get-includes.xsl"; + foreach my $inDir (glob "$inDirs") { next unless -d $inDir; print STDERR "INFO: Validating directory $inDir\n"; @@ -88,59 +97,77 @@ sub validate { my $interfix = $type; $interfix =~ s/^TEI//; print STDERR "INFO: Validating $type root $rootFile\n"; - &run($Chars, $rootFile); - &run("$Jing $schemaDir/ParlaMint-teiCorpus$interfix.rng", $rootFile); - &run("$Saxon outDir=$tmpDir -xsl:$Compose", $rootFile); - &run("$Jing $schemaDir/ParlaMint.odd.rng", "$tmpDir/$fileName"); - &run("$Saxon -xsl:$Valid", $rootFile); - 
&run("$Saxon -xsl:$Valid_particDesc", $rootFile); - &run("$Saxon -xsl:$Links", $rootFile); + &run($Chars, $rootFile, 1); + &run("$Jing $schemaDir/ParlaMint-teiCorpus$interfix.rng", $rootFile, 1); + &run("$Saxon outDir=$tmpDir -xsl:$Compose", $rootFile, 1); + &run("$Jing $schemaDir/ParlaMint.odd.rng", "$tmpDir/$fileName", 1); + &run("$Saxon -xsl:$Valid", $rootFile, 1); + &run("$Saxon -xsl:$Valid_particDesc", $rootFile, 1); + &run("$Saxon -xsl:$Links", $rootFile, 1); @includes = split(/\n/, `$Saxon -xsl:$Includes $rootFile`); + open(TASKS, '>:utf8', "$tmpDir/$fileName.validate-included.lst") if $procThreads > 1; + my $runNow = !($procThreads > 1); while (my $f = shift @includes) { $file = "$inDir/$f"; + my $fileTasks = ''; if (-e $file) { if($file =~ m/ParlaMint-(?:[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?)?.?(taxonomy|listPerson|listOrg).*\.xml/){ - print STDERR "INFO: Validating file included in teiHeader $file\n"; - &run($Chars, $file); - &run("$Jing $schemaDir/ParlaMint-$1.rng", $file); - &run("$Saxon meta=$rootFile -xsl:$Links", $file); + $fileTasks .= &printMsg("INFO: Validating file included in teiHeader $file",$runNow); + $fileTasks .= &run($Chars, $file, $runNow); + $fileTasks .= &run("$Jing $schemaDir/ParlaMint-$1.rng", $file, $runNow); + $fileTasks .= &run("$Saxon meta=$rootFile -xsl:$Links", $file, $runNow); } else { - print STDERR "INFO: Validating component $type file $file\n"; - &run($Chars, $file); - &run("$Jing $schemaDir/ParlaMint-TEI$interfix.rng", $file); - &run("$Jing $schemaDir/ParlaMint.odd.rng", $file); - &run("$Saxon -xsl:$Valid", $file); - &run("$Saxon meta=$rootFile -xsl:$Links", $file); + $fileTasks .= &printMsg("INFO: Validating component $type file $file",$runNow); + $fileTasks .= &run($Chars, $file, $runNow); + $fileTasks .= &run("$Jing $schemaDir/ParlaMint-TEI$interfix.rng", $file, $runNow); + $fileTasks .= &run("$Jing $schemaDir/ParlaMint.odd.rng", $file, $runNow); + $fileTasks .= &run("$Saxon -xsl:$Valid", $file, $runNow); + 
$fileTasks .= &run("$Saxon meta=$rootFile -xsl:$Links", $file, $runNow); } } - else {print STDERR "ERROR: $rootFile XIncluded file $file does not exist!\n"} + else {$fileTasks .= &printMsg("ERROR: $rootFile XIncluded file $file does not exist!",$runNow)} + print TASKS "$fileTasks\n" unless $runNow ; } + close TASKS if $procThreads > 1; + `cat "$tmpDir/$fileName.validate-included.lst"| $Parallel "{}"` unless $runNow; } +sub printMsg { + my $msg = shift; + my $runNow = shift; + my $cmd = "echo -n \"$msg\\n\" 1>&2"; + `$cmd` if $runNow; + return "$cmd ;"; +} sub run { my $command = shift; my $file = shift; + my $runNow = shift; my ($fName) = $file =~ m|([^/]+)$| or die "FATAL ERROR: Bad file '$file'\n"; my $msg = ''; + my $cmd = ''; if ($command =~ /$Jing/) { - $msg = "INFO: XML validation for $fName\n" + $msg = "INFO: XML validation for $fName\\n" } elsif ($command =~ /$Compose/) { } elsif ($command =~ /$Chars/) { } elsif ($command =~ /$Valid/) { - $msg = "INFO: Content validaton for $fName\n" + $msg = "INFO: Content validaton for $fName\\n" } elsif ($command =~ /$Valid_particDesc/) { - $msg = "INFO: particDesc content validaton for $fName\n" + $msg = "INFO: particDesc content validaton for $fName\\n" } elsif ($command =~ /$Links/) { - $msg = "INFO: Link checking for $fName\n" + $msg = "INFO: Link checking for $fName\\n" } else {die "FATAL ERROR: Weird command $command!\n"} - print STDERR $msg; - `$command $file 1>&2`; + $cmd .= "echo -n \"$msg\" 1>&2;" if $msg; + $cmd .= "$command $file 1>&2"; + #print STDERR "### $cmd ###\n"; + `$cmd` if $runNow; + return "$cmd ;"; } From cc94650406d3ad497b8217e26e8080d8e03c5f34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?= Date: Thu, 23 Jan 2025 09:19:04 +0100 Subject: [PATCH 3/9] customize number of jobs in parallel #895 --- Scripts/parlamint2distro.pl | 8 ++++---- Scripts/parlamintp-tei2text.pl | 9 ++++++++- Scripts/parlamintp-tei2vert.pl | 9 ++++++++- Scripts/parlamintp2conllu.pl | 9 ++++++++- 4 files 
changed, 28 insertions(+), 7 deletions(-) diff --git a/Scripts/parlamint2distro.pl b/Scripts/parlamint2distro.pl index 885ee5676..f075b8628 100755 --- a/Scripts/parlamint2distro.pl +++ b/Scripts/parlamint2distro.pl @@ -382,8 +382,8 @@ sub usage { if ($MT) {$inReadme = "$docsDir/README-$MT.text.txt"} else {$inReadme = "$docsDir/README.text.txt"} &cp_readme($countryCode, $handleTxt, $Version, $inReadme, "$outTxtDir/00README.txt"); - if (-e $outTeiDir) {`$scriptTexts $outTeiDir $outTxtDir`} - elsif (-e $outAnaDir) {`$scriptTexts $outAnaDir $outTxtDir`} + if (-e $outTeiDir) {`$scriptTexts --procThreads $procThreads $outTeiDir $outTxtDir`} + elsif (-e $outAnaDir) {`$scriptTexts --procThreads $procThreads $outAnaDir $outTxtDir`} else {die "FATAL ERROR: Neither $outTeiDir nor $outAnaDir exits\n"} &dirify($outTxtDir); } @@ -396,7 +396,7 @@ sub usage { if ($MT) {$inReadme = "$docsDir/README-$MT.conll.txt"} else {$inReadme = "$docsDir/README.conll.txt"} &cp_readme($countryCode, $handleAna, $Version, $inReadme, "$outConlDir/00README.txt"); - `$scriptConls $outAnaDir $outConlDir`; + `$scriptConls --procThreads $procThreads $outAnaDir $outConlDir`; &dirify($outConlDir); } if (($procAll and $procVert) or (!$procAll and $procVert == 1)) { @@ -410,7 +410,7 @@ sub usage { &cp_readme($countryCode, $handleAna, $Version, $inReadme, "$outVertDir/00README.txt"); if (-e "$regiDir/$vertRegi") {`cp $regiDir/$vertRegi $outVertDir/$vertRegi.$regiExt`} else {print STDERR "WARN: registry file $vertRegi not found\n"} - `$scriptVerts $outAnaDir $outVertDir`; + `$scriptVerts --procThreads $procThreads $outAnaDir $outVertDir`; &dirify($outVertDir); } logger(); diff --git a/Scripts/parlamintp-tei2text.pl b/Scripts/parlamintp-tei2text.pl index 711c50e76..8c7fbe625 100755 --- a/Scripts/parlamintp-tei2text.pl +++ b/Scripts/parlamintp-tei2text.pl @@ -8,12 +8,19 @@ my $tempdirroot = "$Bin/tmp"; my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1); +my $procThreads = 10; + +GetOptions + ( + 
'procThreads=i'=> \$procThreads, +); + $inDir = File::Spec->rel2abs(shift); $outDir = File::Spec->rel2abs(shift); binmode(STDERR, 'utf8'); -$Para = 'parallel --gnu --halt 2 --jobs 10'; +$Para = "parallel --gnu --halt 2 --jobs $procThreads"; $Saxon = "java -jar $Bin/bin/saxon.jar"; $scriptMeta = "$Bin/parlamint2meta.xsl"; diff --git a/Scripts/parlamintp-tei2vert.pl b/Scripts/parlamintp-tei2vert.pl index e3d06c532..aed3d86c3 100755 --- a/Scripts/parlamintp-tei2vert.pl +++ b/Scripts/parlamintp-tei2vert.pl @@ -7,12 +7,19 @@ my $tempdirroot = "$Bin/tmp"; my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1); +my $procThreads = 10; + +GetOptions + ( + 'procThreads=i'=> \$procThreads, +); + $inDir = File::Spec->rel2abs(shift); $outDir = File::Spec->rel2abs(shift); binmode(STDERR, 'utf8'); -$Para = 'parallel --gnu --halt 2 --jobs 10'; +$Para = "parallel --gnu --halt 2 --jobs $procThreads"; $Saxon = "java -jar $Bin/bin/saxon.jar"; $TEI2VERT = "$Bin/parlamint2xmlvert.xsl"; diff --git a/Scripts/parlamintp2conllu.pl b/Scripts/parlamintp2conllu.pl index 6acde6f8e..7c5f5c5f7 100755 --- a/Scripts/parlamintp2conllu.pl +++ b/Scripts/parlamintp2conllu.pl @@ -25,10 +25,17 @@ sub usage my $tempdirroot = "$Bin/tmp"; my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1); +my $procThreads = 10; + +GetOptions + ( + 'procThreads=i'=> \$procThreads, +); + $inDir = File::Spec->rel2abs(shift); $outDir = File::Spec->rel2abs(shift); -$Para = 'parallel --gnu --halt 0 --jobs 10'; +$Para = "parallel --gnu --halt 0 --jobs $procThreads"; $Saxon = "java -jar $Bin/bin/saxon.jar"; $scriptValid = "$Bin/bin/tools/validate.py"; From 85c46562b0c2c3e5fe71f61a964d878421da279e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?= Date: Tue, 28 Jan 2025 09:06:31 +0100 Subject: [PATCH 4/9] add missing getopt #895 --- Scripts/parlamintp-tei2text.pl | 1 + Scripts/parlamintp-tei2vert.pl | 1 + Scripts/parlamintp2conllu.pl | 1 + 3 files changed, 3 insertions(+) diff --git a/Scripts/parlamintp-tei2text.pl 
b/Scripts/parlamintp-tei2text.pl index 8c7fbe625..5a5c6829c 100755 --- a/Scripts/parlamintp-tei2text.pl +++ b/Scripts/parlamintp-tei2text.pl @@ -5,6 +5,7 @@ use File::Spec; use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory +use Getopt::Long; my $tempdirroot = "$Bin/tmp"; my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1); diff --git a/Scripts/parlamintp-tei2vert.pl b/Scripts/parlamintp-tei2vert.pl index aed3d86c3..9455ea58f 100755 --- a/Scripts/parlamintp-tei2vert.pl +++ b/Scripts/parlamintp-tei2vert.pl @@ -4,6 +4,7 @@ use FindBin qw($Bin); use File::Spec; use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory +use Getopt::Long; my $tempdirroot = "$Bin/tmp"; my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1); diff --git a/Scripts/parlamintp2conllu.pl b/Scripts/parlamintp2conllu.pl index 7c5f5c5f7..0ce0d0a53 100755 --- a/Scripts/parlamintp2conllu.pl +++ b/Scripts/parlamintp2conllu.pl @@ -19,6 +19,7 @@ sub usage print STDERR (" .conllu and -meta.tsv files in the \n"); print STDERR (" Also validates the .conllu agains UD validations script\n"); } +use Getopt::Long; use FindBin qw($Bin); use File::Spec; use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory From 96df102177eb4bf287dd35605dc4e4401ce07941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?= Date: Tue, 28 Jan 2025 16:33:43 +0100 Subject: [PATCH 5/9] set halt 0 #895 --- Scripts/parlamintp-tei2text.pl | 2 +- Scripts/parlamintp-tei2vert.pl | 2 +- Scripts/validate-parlamint.pl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Scripts/parlamintp-tei2text.pl b/Scripts/parlamintp-tei2text.pl index 5a5c6829c..7e9536a87 100755 --- a/Scripts/parlamintp-tei2text.pl +++ b/Scripts/parlamintp-tei2text.pl @@ -21,7 +21,7 @@ binmode(STDERR, 'utf8'); -$Para = "parallel --gnu --halt 2 --jobs $procThreads"; +$Para = "parallel --gnu --halt 0 --jobs 
$procThreads"; $Saxon = "java -jar $Bin/bin/saxon.jar"; $scriptMeta = "$Bin/parlamint2meta.xsl"; diff --git a/Scripts/parlamintp-tei2vert.pl b/Scripts/parlamintp-tei2vert.pl index 9455ea58f..e96252961 100755 --- a/Scripts/parlamintp-tei2vert.pl +++ b/Scripts/parlamintp-tei2vert.pl @@ -20,7 +20,7 @@ binmode(STDERR, 'utf8'); -$Para = "parallel --gnu --halt 2 --jobs $procThreads"; +$Para = "parallel --gnu --halt 0 --jobs $procThreads"; $Saxon = "java -jar $Bin/bin/saxon.jar"; $TEI2VERT = "$Bin/parlamint2xmlvert.xsl"; diff --git a/Scripts/validate-parlamint.pl b/Scripts/validate-parlamint.pl index e8b5fba4b..7fa6db341 100755 --- a/Scripts/validate-parlamint.pl +++ b/Scripts/validate-parlamint.pl @@ -42,7 +42,7 @@ sub usage $schemaDir = File::Spec->rel2abs(shift); $inDirs = File::Spec->rel2abs(shift); -$Parallel = "parallel --keep-order --gnu --halt 2 --jobs $procThreads"; +$Parallel = "parallel --keep-order --gnu --halt 0 --jobs $procThreads"; $Jing = "java -jar $Bin/bin/jing.jar"; $Saxon = "java -jar $Bin/bin/saxon.jar"; From 5ff8fc20a638da2d051393a5e33e2af7a596af58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?= Date: Thu, 30 Jan 2025 09:43:17 +0100 Subject: [PATCH 6/9] reduce parallelism when meta script is run (not enough memory) #895 --- Scripts/parlamintp-tei2text.pl | 3 ++- Scripts/parlamintp2conllu.pl | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Scripts/parlamintp-tei2text.pl b/Scripts/parlamintp-tei2text.pl index 7e9536a87..3b5ea5d3e 100755 --- a/Scripts/parlamintp-tei2text.pl +++ b/Scripts/parlamintp-tei2text.pl @@ -22,6 +22,7 @@ binmode(STDERR, 'utf8'); $Para = "parallel --gnu --halt 0 --jobs $procThreads"; +$ParaLess = "parallel --gnu --halt 0 --jobs ".int($procThreads / 3); $Saxon = "java -jar $Bin/bin/saxon.jar"; $scriptMeta = "$Bin/parlamint2meta.xsl"; @@ -75,7 +76,7 @@ " meta=" . File::Spec->catfile($inDir,$rootFile[0]) . " out-lang=$outLang" . 
" -xsl:$scriptMeta {} > $outDir/{/.}$outSuffix"; - `cat $fileFile | $Para '$command'`; + `cat $fileFile | $ParaLess '$command'`; # The rm following looks like a bug, as no TSV files are left if we are processing only .ana! #`rm -f $outDir/*.ana-meta.tsv`; } diff --git a/Scripts/parlamintp2conllu.pl b/Scripts/parlamintp2conllu.pl index 0ce0d0a53..f3e5c87c5 100755 --- a/Scripts/parlamintp2conllu.pl +++ b/Scripts/parlamintp2conllu.pl @@ -37,6 +37,7 @@ sub usage $outDir = File::Spec->rel2abs(shift); $Para = "parallel --gnu --halt 0 --jobs $procThreads"; +$ParaLess = "parallel --gnu --halt 0 --jobs ".int($procThreads / 3); $Saxon = "java -jar $Bin/bin/saxon.jar"; $scriptValid = "$Bin/bin/tools/validate.py"; @@ -127,7 +128,7 @@ sub usage $command = "$Saxon meta=$rootAnaFile" . " out-lang=$outLang" . " -xsl:$scriptMeta {} > $outDir/{/.}$outSuffix"; - `cat $fileFile | $Para '$command'`; + `cat $fileFile | $ParaLess '$command'`; } } `rename 's/\.ana//' $outDir/*-meta*.tsv`; From 76963a839ecd3939daa05e7fed2311c91e8f4b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?= Date: Tue, 4 Feb 2025 15:17:52 +0100 Subject: [PATCH 7/9] reduce paralelism in vert #895 --- Scripts/parlamint2distro.pl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Scripts/parlamint2distro.pl b/Scripts/parlamint2distro.pl index 16f23de11..a64510523 100755 --- a/Scripts/parlamint2distro.pl +++ b/Scripts/parlamint2distro.pl @@ -108,6 +108,9 @@ sub usage { # Problem with Out of heap space with TR, NL, GB for ana $SaxonX = "java -Xmx${procMemGB}g -jar $Bin/bin/saxon.jar"; +# reduce paralelism for metadata processing, if listPerson and listOrg are large, then there can be problem with memory size +my $minProcThreads = $procThreads > 30 ? 
int($procThreads/3) : $procThreads; + # logger variable stores info how long takes certain parts of code, used by logger subrutine my $logger = { code => '', @@ -337,7 +340,7 @@ sub usage { `$Saxon outDir=$outSmpDir -xsl:$scriptSample $outAnaRoot`; #Make also derived files `$scriptTexts -jobs $procThreads -in $outSmpDir -out $outSmpDir` unless $outTeiRoot; - `$scriptVerts -jobs $procThreads -in $outSmpDir -out $outSmpDir`; + `$scriptVerts -jobs $minProcThreads -in $outSmpDir -out $outSmpDir`; if (-e "$regiDir/$vertRegi") {`cp $regiDir/$vertRegi $outSmpDir/$vertRegi.$regiExt`} else {print STDERR "WARN: registry file $vertRegi not found\n"} `$scriptConls -jobs $procThreads -in $outSmpDir -out $outSmpDir` @@ -410,7 +413,7 @@ sub usage { &cp_readme($countryCode, $handleAna, $Version, $inReadme, "$outVertDir/00README.txt"); if (-e "$regiDir/$vertRegi") {`cp $regiDir/$vertRegi $outVertDir/$vertRegi.$regiExt`} else {print STDERR "WARN: registry file $vertRegi not found\n"} - `$scriptVerts -jobs $procThreads -in $outAnaDir -out $outVertDir`; + `$scriptVerts -jobs $minProcThreads -in $outAnaDir -out $outVertDir`; &dirify($outVertDir); } logger(); From 74181f700923ac537f888845569543a5c7864902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?= Date: Fri, 7 Feb 2025 09:20:28 +0100 Subject: [PATCH 8/9] setting context in get-includes script with context-elements="parent elements names" (print all by default) --- Scripts/get-includes.xsl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Scripts/get-includes.xsl b/Scripts/get-includes.xsl index fa54d58e6..fa734ebb0 100644 --- a/Scripts/get-includes.xsl +++ b/Scripts/get-includes.xsl @@ -8,8 +8,17 @@ exclude-result-prefixes="#all" version="2.0"> + + + + - + From f80fbb8a07e55355bc8299751b30c4d4ec7389e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Kopp?= Date: Fri, 7 Feb 2025 15:09:18 +0100 Subject: [PATCH 9/9] factorize tei2meta out of text and conllu scripts #896 --- 
Scripts/parlamint2distro.pl | 18 +++++++- Scripts/parlamintp-tei2meta.pl | 76 ++++++++++++++++++++++++++++++++++ Scripts/parlamintp-tei2text.pl | 29 ------------- Scripts/parlamintp2conllu.pl | 22 ---------- 4 files changed, 92 insertions(+), 53 deletions(-) create mode 100755 Scripts/parlamintp-tei2meta.pl diff --git a/Scripts/parlamint2distro.pl b/Scripts/parlamint2distro.pl index a64510523..509ca23e0 100755 --- a/Scripts/parlamint2distro.pl +++ b/Scripts/parlamint2distro.pl @@ -179,6 +179,7 @@ sub usage { $scriptTexts = "$Bin/parlamintp-tei2text.pl"; $scriptVerts = "$Bin/parlamintp-tei2vert.pl"; $scriptConls = "$Bin/parlamintp2conllu.pl"; +$scriptMetas = "$Bin/parlamintp-tei2meta.pl"; $XX_template = "ParlaMint-XX"; @@ -333,13 +334,19 @@ sub usage { `rm -fr $outSmpDir; mkdir $outSmpDir`; if (-e $outTeiRoot) { `$Saxon outDir=$outSmpDir -xsl:$scriptSample $outTeiRoot`; + my $outTeiSmpRoot = File::Spec->catfile($outSmpDir, (File::Spec->splitpath($outTeiRoot))[2]); `$scriptTexts -jobs $procThreads -in $outSmpDir -out $outSmpDir`; + &dirify($outSmpDir); + `$scriptMetas -jobs $procThreads -inRoot $outTeiSmpRoot -out $outSmpDir`; } else {print STDERR "WARN: No TEI files for $countryCode samples (needed root file is $outTeiRoot)\n"} if (-e $outAnaRoot) { `$Saxon outDir=$outSmpDir -xsl:$scriptSample $outAnaRoot`; #Make also derived files + my $outAnaSmpRoot = File::Spec->catfile($outSmpDir, (File::Spec->splitpath($outAnaRoot))[2]); `$scriptTexts -jobs $procThreads -in $outSmpDir -out $outSmpDir` unless $outTeiRoot; + &dirify($outSmpDir); + `$scriptMetas -jobs $procThreads -inRoot $outAnaSmpRoot -out $outSmpDir` unless $outTeiRoot; `$scriptVerts -jobs $minProcThreads -in $outSmpDir -out $outSmpDir`; if (-e "$regiDir/$vertRegi") {`cp $regiDir/$vertRegi $outSmpDir/$vertRegi.$regiExt`} else {print STDERR "WARN: registry file $vertRegi not found\n"} @@ -385,8 +392,14 @@ sub usage { if ($MT) {$inReadme = "$docsDir/README-$MT.text.txt"} else {$inReadme = 
"$docsDir/README.text.txt"} &cp_readme($countryCode, $handleTxt, $Version, $inReadme, "$outTxtDir/00README.txt"); - if (-e $outTeiDir) {`$scriptTexts -jobs $procThreads -in $outTeiDir -out $outTxtDir`} - elsif (-e $outAnaDir) {`$scriptTexts -jobs $procThreads -in $outAnaDir -out $outTxtDir`} + if (-e $outTeiDir) { + `$scriptTexts -jobs $procThreads -in $outTeiDir -out $outTxtDir`; + `$scriptMetas -jobs $procThreads -inRoot $outTeiRoot -out $outTxtDir`; + } + elsif (-e $outAnaDir) { + `$scriptTexts -jobs $procThreads -in $outAnaDir -out $outTxtDir`; + `$scriptMetas -jobs $procThreads -in $outAnaRoot -out $outTxtDir`; + } else {die "FATAL ERROR: Neither $outTeiDir nor $outAnaDir exits\n"} &dirify($outTxtDir); } @@ -400,6 +413,7 @@ sub usage { else {$inReadme = "$docsDir/README.conll.txt"} &cp_readme($countryCode, $handleAna, $Version, $inReadme, "$outConlDir/00README.txt"); `$scriptConls -jobs $procThreads -in $outAnaDir -out $outConlDir`; + `$scriptMetas -jobs $procThreads -inRoot $outAnaRoot -out $outConlDir`; &dirify($outConlDir); } if (($procAll and $procVert) or (!$procAll and $procVert == 1)) { diff --git a/Scripts/parlamintp-tei2meta.pl b/Scripts/parlamintp-tei2meta.pl new file mode 100755 index 000000000..f7f9ec886 --- /dev/null +++ b/Scripts/parlamintp-tei2meta.pl @@ -0,0 +1,76 @@ +#!/usr/bin/env perl +use warnings; +use utf8; + +sub usage +{ + print STDERR ("Usage: parlamintp-tei2meta.pl -jobs -root -out \n"); + print STDERR (" Converts ParlaMint component files in the to\n"); + print STDERR (" -meta.tsv files in the \n"); + print STDERR (" using parallel in execution.\n"); +} + +use Getopt::Long; +use FindBin qw($Bin); +use File::Spec; +use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory + +my $tempdirroot = "$Bin/tmp"; +my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1); + +GetOptions + ( + 'help' => \$help, + 'inRoot=s' => \$inRoot, + 'out=s' => \$outDir, + 'jobs=i' => \$procThreads, +); + +if ($help) { + &usage; + exit; +} + 
+$inRoot = File::Spec->rel2abs($inRoot) if $inRoot; +$inDir = (File::Spec->splitpath($inRoot))[1]; +$outDir = File::Spec->rel2abs($outDir) if $outDir; +$procThreads = 1 unless $procThreads; + +binmode(STDERR, 'utf8'); + +$Para = "parallel --gnu --halt 0 --jobs $procThreads"; +$Saxon = "java -jar $Bin/bin/saxon.jar"; +$scriptMeta = "$Bin/parlamint2meta.xsl"; +$Includes = "$Bin/get-includes.xsl"; + +`rm -f $outDir/*-meta.tsv`; + +#Store all files to be processed in $fileFile +$fileFile = "$DIR/files.lst"; +`$Saxon -xsl:$Includes context-elements="teiCorpus" $inRoot | sed "s#^#$inDir/#" > $fileFile`; + +#Is this an MTed corpus? +$MT = $inDir =~ m/-en/; + +print STDERR "INFO: Making metadata files from component files in $inRoot\n"; +#For MTed corpora output only en metadata, for native, both xx and en +if ($MT) {@outLangs = ('en')} else {@outLangs = ('xx', 'en')} +# For orig corpora make ParlaMint-XX-meta.tsv in corpus language and ParlaMint-XX-meta-en.tsv in English +# For MTed corpora we produce ParlaMint-XX-en-meta.tsv in English +foreach my $outLang (@outLangs) { + my $outSuffix; + if ($MT and $outLang eq 'xx') {} + elsif ($MT and $outLang eq 'en') {$outSuffix = "-meta.tsv"} + elsif ($outLang eq 'xx') {$outSuffix = "-meta.tsv"} + elsif ($outLang eq 'en') {$outSuffix = "-meta-en.tsv"} + if ($outSuffix) { + $command = "$Saxon" . + " meta=$inRoot" . + " out-lang=$outLang" . + " -xsl:$scriptMeta {} > $outDir/{/.}$outSuffix"; + `cat $fileFile | $Para '$command'`; + # The rm following looks like a bug, as no TSV files are left if we are processing only .ana! 
+ #`rm -f $outDir/*.ana-meta.tsv`; + } +} +`rename 's/\.ana//' $outDir/*-meta*.tsv`; diff --git a/Scripts/parlamintp-tei2text.pl b/Scripts/parlamintp-tei2text.pl index fce8750b2..3bd2adcf6 100755 --- a/Scripts/parlamintp-tei2text.pl +++ b/Scripts/parlamintp-tei2text.pl @@ -38,11 +38,9 @@ sub usage binmode(STDERR, 'utf8'); $Para = "parallel --gnu --halt 0 --jobs $procThreads"; -$ParaLess = "parallel --gnu --halt 0 --jobs ".int($procThreads / 3); $Saxon = "java -jar $Bin/bin/saxon.jar"; -$scriptMeta = "$Bin/parlamint2meta.xsl"; $scriptText = "$Bin/parlamint-tei2text.xsl"; print STDERR "INFO: Converting directory $inDir\n"; @@ -50,8 +48,6 @@ sub usage #Store all files to be processed in $fileFile $fileFile = "$DIR/files.lst"; $corpusFiles = "$inDir/*_*.xml $inDir/*/*_*.xml"; -#Is this an MTed corpus? -$MT = $inDir =~ m/-en/; #We can convert either plain files or .ana files open(TMP, '>:utf8', $fileFile); @@ -75,28 +71,3 @@ sub usage `cat $fileFile | $Para '$command'`; `rename 's/\.ana//' $outDir/*.txt`; -print STDERR "INFO: Making metadata files\n"; -opendir(CORPUSDIR, $inDir); -@rootFile = grep {/ParlaMint-[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?(\.ana)?\.xml$/} readdir(CORPUSDIR); -closedir(CORPUSDIR); -#For MTed corpora output only en metadata, for native, both xx and en -if ($MT) {@outLangs = ('en')} else {@outLangs = ('xx', 'en')} -# For orig corpora make ParlaMint-XX-meta.tsv in corpus language and ParlaMint-XX-meta-en.tsv in English -# For MTed corpora we produce ParlaMint-XX-en-meta.tsv in English -foreach my $outLang (@outLangs) { - my $outSuffix; - if ($MT and $outLang eq 'xx') {} - elsif ($MT and $outLang eq 'en') {$outSuffix = "-meta.tsv"} - elsif ($outLang eq 'xx') {$outSuffix = "-meta.tsv"} - elsif ($outLang eq 'en') {$outSuffix = "-meta-en.tsv"} - if ($outSuffix) { - $command = "$Saxon" . - " meta=" . File::Spec->catfile($inDir,$rootFile[0]) . - " out-lang=$outLang" . 
- " -xsl:$scriptMeta {} > $outDir/{/.}$outSuffix"; - `cat $fileFile | $ParaLess '$command'`; - # The rm following looks like a bug, as no TSV files are left if we are processing only .ana! - #`rm -f $outDir/*.ana-meta.tsv`; - } -} -`rename 's/\.ana//' $outDir/*-meta*.tsv`; diff --git a/Scripts/parlamintp2conllu.pl b/Scripts/parlamintp2conllu.pl index b83cad6c2..b2d887740 100755 --- a/Scripts/parlamintp2conllu.pl +++ b/Scripts/parlamintp2conllu.pl @@ -47,13 +47,11 @@ sub usage $procThreads = 1 unless $procThreads; $Para = "parallel --gnu --halt 0 --jobs $procThreads"; -$ParaLess = "parallel --gnu --halt 0 --jobs ".int($procThreads / 3); $Saxon = "java -jar $Bin/bin/saxon.jar"; $scriptValid = "$Bin/bin/tools/validate.py"; $scriptConvert = "$Bin/parlamint2conllu.xsl"; -$scriptMeta = "$Bin/parlamint2meta.xsl"; #This should be somehow factorised out!! $country2lang{'AT'} = 'de'; @@ -122,28 +120,8 @@ sub usage close TMP; `mkdir $outDir` unless -e "$outDir"; -`rm -f $outDir/*-meta.tsv`; `rm -f $outDir/*.conllu`; -#For MTed corpora output only en metadata, for native, both xx and en -if ($MT) {@outLangs = ('en')} else {@outLangs = ('xx', 'en')} -# For orig corpora make ParlaMint-XX-meta.tsv in corpus language and ParlaMint-XX-meta-en.tsv in English -# For MTed corpora we produce ParlaMint-XX-en-meta.tsv in English -foreach my $outLang (@outLangs) { - my $outSuffix; - if ($MT and $outLang eq 'xx') {} - elsif ($MT and $outLang eq 'en') {$outSuffix = "-meta.tsv"} - elsif ($outLang eq 'xx') {$outSuffix = "-meta.tsv"} - elsif ($outLang eq 'en') {$outSuffix = "-meta-en.tsv"} - if ($outSuffix) { - $command = "$Saxon meta=$rootAnaFile" . - " out-lang=$outLang" . - " -xsl:$scriptMeta {} > $outDir/{/.}$outSuffix"; - `cat $fileFile | $ParaLess '$command'`; - } -} -`rename 's/\.ana//' $outDir/*-meta*.tsv`; - # Produce common CoNLL-U, even if we have more languages in a corpus if ($langs !~ /,/) {$checkLang = $langs} else {($checkLang) = $langs =~ /(.+?),/}